# Table of contents:

 ### - Import libraries and datasets
 ### - Importing the dataset
 ### - Dataset Summary
    1. Closer Examination of the Target Variable, Sentiment
 ### - Data Preprocessing and Engineering
    1. Create copy
    2. Hashtag extraction
    3. message cleaning
    4. Parts of speech tagging and lemmatization
    5. Word frequency
 ### - Exploratory data analysis
    1. Target variable distribution
    2. message length distribution
    3. Most common words
    4. Hashtags
 ### - Building classification models
    1. Train-validation split
    2. Pipelines
    3. Train models
 ### - Model evaluation
    1. Random forest
    2. Naive Bayes
    3. K nearest neighbors
    4. Logistic regression
    5. Linear SVC
 ### - Model Selection
 ### - Hyperparameter tuning


 

# Import libraries and datasets

Initial step is to load libraries that we intend to use in the notebook . 

In [0]:


# Packages for data processinag
import numpy as np
import pandas as pd
import datetime
from sklearn import preprocessing
from sklearn.manifold import TSNE
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from scipy.sparse import csr_matrix
import scipy as sp
from skimpy import skim

# Package for timing code
import timeit
import time
import datetime


# Packages for visualization
import matplotlib
import seaborn as sns 
%matplotlib inline
import matplotlib.pyplot as plt 

# Style
import matplotlib.style as style 
sns.set(font_scale=1.5)
style.use('seaborn-pastel')
sns.set(style="whitegrid")
sns.set_style("dark")

# Packages for modeling
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import mean_squared_error, mean_absolute_error
#from surprise import KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore
#from surprise.model_selection import cross_validate
#from surprise.model_selection import GridSearchCV
#from surprise import SVD,NormalPredictor
#from surprise import SVDpp
#from surprise import NMF
#from surprise import SlopeOne
#from surprise import CoClustering,BaselineOnly
from sklearn.neighbors import  NearestNeighbors
import heapq

# Performance Evaluation
#from surprise import accuracy
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
#from surprise.model_selection import GridSearchCV, cross_validate, train_test_split
from time import time

# Package to suppress warnings
import warnings
warnings.filterwarnings("ignore")

# Packages for saving models
import pickle

#import cufflinks as cf
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

# Packages for Randomisation
import random
%matplotlib inline

sns.set(style='whitegrid', palette='muted',
        rc={'figure.figsize': (15,10)})

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

  style.use('seaborn-pastel')


# Importing the dataset

1. Train.csv:  Dataset that contains all the variables that should be used to train the model

2. Test.csv :  Dataset that contains variables that will be used to test the model

In [1]:
#load the training and test data set
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [2]:
#The first five rows of the traing dataset
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


# Dataset summary
An overview of the raw data

In [3]:
# View the first 5 rows of the test data
test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


**Step 1**: We will take a look at the shape of the dataframe to see the amount of data we are working with, the **rows** and the **columns**

In [4]:
#checking the shape of the training dataframe
train.shape
print("The training dataset has {0} rows and {1} columns".format(train.shape[0], train.shape[1]))

The training dataset has 1460 rows and 81 columns


<b>Step 2:</b> let's take a look at the data types in the dataframe using `pd.info()` to get more information about the dataframe

In [13]:
#checking the information of the dataframe
skim(train)

<b>Step 3:</b> Let's check for any null values in the Train & Test dataset

In [6]:
# Check for null values in the training data
null_counts = train.isnull().sum()

# Print all the null counts
for column, null_count in null_counts.items():
    print(f'{column}: {null_count}')

Id: 0
MSSubClass: 0
MSZoning: 0
LotFrontage: 259
LotArea: 0
Street: 0
Alley: 1369
LotShape: 0
LandContour: 0
Utilities: 0
LotConfig: 0
LandSlope: 0
Neighborhood: 0
Condition1: 0
Condition2: 0
BldgType: 0
HouseStyle: 0
OverallQual: 0
OverallCond: 0
YearBuilt: 0
YearRemodAdd: 0
RoofStyle: 0
RoofMatl: 0
Exterior1st: 0
Exterior2nd: 0
MasVnrType: 8
MasVnrArea: 8
ExterQual: 0
ExterCond: 0
Foundation: 0
BsmtQual: 37
BsmtCond: 37
BsmtExposure: 38
BsmtFinType1: 37
BsmtFinSF1: 0
BsmtFinType2: 38
BsmtFinSF2: 0
BsmtUnfSF: 0
TotalBsmtSF: 0
Heating: 0
HeatingQC: 0
CentralAir: 0
Electrical: 1
1stFlrSF: 0
2ndFlrSF: 0
LowQualFinSF: 0
GrLivArea: 0
BsmtFullBath: 0
BsmtHalfBath: 0
FullBath: 0
HalfBath: 0
BedroomAbvGr: 0
KitchenAbvGr: 0
KitchenQual: 0
TotRmsAbvGrd: 0
Functional: 0
Fireplaces: 0
FireplaceQu: 690
GarageType: 81
GarageYrBlt: 81
GarageFinish: 81
GarageCars: 0
GarageArea: 0
GarageQual: 81
GarageCond: 81
PavedDrive: 0
WoodDeckSF: 0
OpenPorchSF: 0
EnclosedPorch: 0
3SsnPorch: 0
ScreenPorch: 0
Pool

In [7]:
# Count unique values in all columns
unique_counts = train.nunique()

print("Unique value counts in each column:")
print(unique_counts)

Unique value counts in each column:
Id               1460
MSSubClass         15
MSZoning            5
LotFrontage       110
LotArea          1073
                 ... 
MoSold             12
YrSold              5
SaleType            9
SaleCondition       6
SalePrice         663
Length: 81, dtype: int64


In [8]:
# Identify columns with null values
columns_with_nulls = train.columns[train.isnull().any()].tolist()

# Count unique values in columns with null values
unique_counts_with_nulls = train[columns_with_nulls].nunique()

print("Unique value counts in columns with null values:")
print(unique_counts_with_nulls)

Unique value counts in columns with null values:
LotFrontage     110
Alley             2
MasVnrType        4
MasVnrArea      327
BsmtQual          4
BsmtCond          4
BsmtExposure      4
BsmtFinType1      6
BsmtFinType2      6
Electrical        5
FireplaceQu       5
GarageType        6
GarageYrBlt      97
GarageFinish      3
GarageQual        5
GarageCond        5
PoolQC            3
Fence             4
MiscFeature       4
dtype: int64


In [9]:
# View columns of the DataFrame
print("Columns of the DataFrame:")
print(train.columns)

Columns of the DataFrame:
Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
      

## Data Engineering

In [17]:
## Here we will check the percentage of nan values present in each feature
## 1 -step make the list of features which has missing values
features_with_na=[features for features in train.columns if train[features].isnull().sum()>1]
## 2- step print the feature name and the percentage of missing values

for feature in features_with_na:
    print(feature, np.round(train[feature].isnull().mean(), 4),  ' % missing values')

LotFrontage 0.1774  % missing values
Alley 0.9377  % missing values
MasVnrType 0.0055  % missing values
MasVnrArea 0.0055  % missing values
BsmtQual 0.0253  % missing values
BsmtCond 0.0253  % missing values
BsmtExposure 0.026  % missing values
BsmtFinType1 0.0253  % missing values
BsmtFinType2 0.026  % missing values
FireplaceQu 0.4726  % missing values
GarageType 0.0555  % missing values
GarageYrBlt 0.0555  % missing values
GarageFinish 0.0555  % missing values
GarageQual 0.0555  % missing values
GarageCond 0.0555  % missing values
PoolQC 0.9952  % missing values
Fence 0.8075  % missing values
MiscFeature 0.963  % missing values


### 1. MSSubClass

In [10]:
nulls_in_MSSubClass = train['MSSubClass'].isnull().sum()

print(f"Number of null values in 'MSSubClass': {nulls_in_MSSubClass}")

Number of null values in 'MSSubClass': 0


In [11]:
unique_values = train['MSSubClass'].unique()

print(f"Unique values in 'MSSubClass': {unique_values}")

Unique values in 'MSSubClass': [ 60  20  70  50 190  45  90 120  30  85  80 160  75 180  40]


There no missing entries and ther are no incorrect entries. 

### 2. MSZoning

In [12]:
nulls_in_MSZoning = train['MSZoning'].isnull().sum()

print(f"Number of null values in : {nulls_in_MSZoning}")

Number of null values in : 0


In [13]:
unique_values_MSZoning = train['MSZoning'].unique()

print(f"Unique values in MSZoning: {unique_values_MSZoning}")

Unique values in MSZoning: ['RL' 'RM' 'C (all)' 'FV' 'RH']


There no missing entries and ther are no incorrect entries.

### 3. LotFrontage

In [14]:
nulls_in_LotFrontage = train['LotFrontage'].isnull().sum()

print(f"Number of null values in LotFrontage: {nulls_in_LotFrontage}")

Number of null values in LotFrontage: 259


In [15]:
LotConfig = train['LotConfig'].unique()

print(f"Unique values in LotConfig: {LotConfig}")

Unique values in LotConfig: ['Inside' 'FR2' 'Corner' 'CulDSac' 'FR3']
