# Loading exploratory data analysis packages
Exploratory data analysis (EDA) packages are used for data analysis and data manipulation. We shall use pandas to read our dataset and NumPy to perform mathematical computations.

In [107]:
import numpy as np
import pandas as pd

In [108]:
# Read the text-filter CSV (UTF-8 encoded) into a pandas DataFrame.
dataset_Textfilter = pd.read_csv(
    'dataset_textfilter.csv',
    encoding='utf8',
)

# Checking data structure
We check the structure to be able to see the available columns in our dataset.

In [109]:
# Preview the first rows to see which columns the dataset provides.
dataset_Textfilter.head()

Unnamed: 0,Input,Area (Feets),Rooms,Bath Room,Living Rooms,Kitchen
0,Area is 1000 square feet. I want 1 bedroom. 1 ...,1000,1,1,1,1
1,It is 1500 square feet. I want to. Two bedroom...,1500,2,1,1,1
2,Area is 1300 square feet. I want to. three bed...,1300,3,1,1,1
3,Area is 900 square feet. two Bedroom. Two wash...,900,2,2,1,1
4,Area is 1000 square feet. I want three bedroom...,1000,3,3,1,1


# Datatype of our labels
We check the data type of our labels as they need to have a uniform data type.

In [110]:
# List the dtype of every column in the dataset.
dataset_Textfilter.dtypes

Input           object
Area (Feets)     int64
Rooms            int64
Bath Room        int64
Living Rooms     int64
Kitchen          int64
dtype: object

# Loading machine learning packages

In [111]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.metrics import accuracy_score,hamming_loss
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Importing problem transformation packages
We will use scikit-multilearn's problem transformation classes to apply three multi-label techniques: binary relevance, classifier chains, and label powerset.

# import sys

! {sys.executable} -m pip install scikit-multilearn

In [112]:
from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.problem_transform import ClassifierChain
from skmultilearn.problem_transform import LabelPowerset

# Text preprocessing
Text preprocessing involves removing stop words and noisy data. Stop words are words that occur very commonly in a given language.

Stop words are removed since they do not have a high classification power during predictive analysis. They tend to bring bias when building the classifier.

We also remove noisy data, which can affect our model during training and introduce errors into it.

To perform text preprocessing, we need to install the neattext package. Neattext is a Python package used for textual data cleaning and text preprocessing.

# import sys
! {sys.executable} -m pip install neattext

In [113]:
import neattext as nt
import neattext.functions as nfx

# Exploring the dataset for noise
Noise refers to unwanted characters in our dataset that may affect our model during training.

In [114]:
# Run neattext's noise scan on each Input text; the resulting Series is the cell output.
(
    dataset_Textfilter['Input']
    .apply(lambda text: nt.TextFrame(text).noise_scan())
)

0     {'text_noise': 10.0, 'text_length': 90, 'noise...
1     {'text_noise': 14.583333333333334, 'text_lengt...
2     {'text_noise': 13.0, 'text_length': 100, 'nois...
3     {'text_noise': 10.588235294117647, 'text_lengt...
4     {'text_noise': 12.244897959183673, 'text_lengt...
5     {'text_noise': 14.583333333333334, 'text_lengt...
6     {'text_noise': 10.638297872340425, 'text_lengt...
7     {'text_noise': 10.588235294117647, 'text_lengt...
8     {'text_noise': 10.588235294117647, 'text_lengt...
9     {'text_noise': 12.5, 'text_length': 96, 'noise...
10    {'text_noise': 12.76595744680851, 'text_length...
11    {'text_noise': 12.76595744680851, 'text_length...
12    {'text_noise': 13.043478260869565, 'text_lengt...
13    {'text_noise': 10.0, 'text_length': 90, 'noise...
14    {'text_noise': 14.14141414141414, 'text_length...
15    {'text_noise': 12.76595744680851, 'text_length...
16    {'text_noise': 10.227272727272728, 'text_lengt...
17    {'text_noise': 14.583333333333334, 'text_l

We explore the noise in the Input column. The output shows the noise statistics for every row, from row 0 to row 39. The noise value of the first row is 10, and that of the last row is 14. We can now extract the stop words from this noisy data.

# Extracting stop words
We use the TextExtractor() and extract_stopwords() methods to extract all the stop words available in our Input column.

In [115]:
# NOTE(review): the stop-word extraction below is commented out, so this cell
# runs nothing and shows no output. Uncomment the line to list the stop words per row.
# dataset_Textfilter['Input'].apply(lambda x:nt.TextExtractor(x).extract_stopwords())
# When enabled, the extracted stop words are displayed as the cell output.

# Removing stop words
We remove stop words using the nfx.remove_stopwords function as shown:

In [116]:
# NOTE(review): stop-word removal is commented out, and even when enabled its result
# is not assigned to anything, so the later `corpus` still holds the original Input text.
# dataset_Textfilter['Input'].apply(nfx.remove_stopwords)

# Saving dataset in a variable

In [117]:
# Use the Input text column as the corpus for feature extraction.
corpus = dataset_Textfilter['Input']

# Using TfidfVectorizer to conduct feature extraction

In [118]:
# Create the TF-IDF vectorizer, leaving every option at its default.
tfidf = TfidfVectorizer()

# Extracting features
The extracted features are numeric TF-IDF weights, one vector per input text. They will be used as input for the model during training and predictive analysis.

In [119]:
# Fit the vectorizer on the corpus, transform it, and convert the result to an array.
Xfeatures = (
    tfidf
    .fit_transform(corpus)
    .toarray()
)

# showing array of features

In [120]:
# Display the feature array produced by the vectorizer.
Xfeatures

array([[0.5144811 , 0.        , 0.        , ..., 0.20681713, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.1722858 , 0.        ,
        0.        ],
       [0.        , 0.        , 0.459373  , ..., 0.17227965, 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.16863856, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.35790026,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.16997643, 0.        ,
        0.        ]])