In [67]:
#import pandas
import pandas as pd

In [68]:
#read housing data
housing = pd.read_csv (r'data/house.csv')

In [69]:
#make a copy of following features
housing_df = housing[['housing_median_age', 'total_rooms',
        'total_bedrooms', 'population', 'households', 'median_income',
        'median_house_value','ocean_proximity']].copy()
housing_df.head()

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


<font color = purple> The usual missing value imputation routine please

In [70]:
# check for missing values
housing_df['total_bedrooms'].isna().value_counts()
# fill misisng values with median
median_tb = housing['total_bedrooms'].median()
# fill the na's with median
housing_df['total_bedrooms'].fillna(median_tb,inplace = True)
housing_df['total_bedrooms'].isna().value_counts()

False    20640
Name: total_bedrooms, dtype: int64

## <font color = blue> Label Encoding

<font color = purple> instead of mapping the categorical variable<br>
    use label encoder from sklearn

In [71]:
#import preprocessing from sklearn
from sklearn import preprocessing

In [72]:
# create a variable called lab_enc which is labelEncoder
lab_enc = preprocessing.LabelEncoder()

In [73]:
# get unique labels we want to change
lab_enc.fit(housing_df['ocean_proximity'].unique())
# check the unique values
housing_df['ocean_proximity'].unique()

array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

In [74]:
# check unique values to the ones above
list(lab_enc.classes_)

['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN']

In [75]:
housing_df['ocean_proximity'].head()

0    NEAR BAY
1    NEAR BAY
2    NEAR BAY
3    NEAR BAY
4    NEAR BAY
Name: ocean_proximity, dtype: object

In [76]:
housing_df['ocean_proximity'].tail()

20635    INLAND
20636    INLAND
20637    INLAND
20638    INLAND
20639    INLAND
Name: ocean_proximity, dtype: object

In [77]:
# Now transform the data with the labelEncoder
housing_df['ocean_proximity'] = lab_enc.transform(housing_df['ocean_proximity'])

In [78]:
# check head again
housing_df['ocean_proximity'].head()

0    3
1    3
2    3
3    3
4    3
Name: ocean_proximity, dtype: int32

In [79]:
# check tail again
housing_df['ocean_proximity'].tail()

20635    1
20636    1
20637    1
20638    1
20639    1
Name: ocean_proximity, dtype: int32

<font color = purple> Now the usual scaling routine please 

In [80]:
# using StandardScaler as ss, scale the data
from sklearn.preprocessing import StandardScaler as ss
temp = housing_df[['housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value']].copy()
temp = ss().fit_transform(temp)

In [81]:
housing_df[['housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value']] = temp
housing_df.head()

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,0.982143,-0.804819,-0.972476,-0.974429,-0.977033,2.344766,2.129631,3
1,-0.607019,2.04589,1.357143,0.861439,1.669961,2.332238,1.314156,3
2,1.856182,-0.535746,-0.827024,-0.820777,-0.843637,1.782699,1.258693,3
3,1.856182,-0.624215,-0.719723,-0.766028,-0.733781,0.932968,1.1651,3
4,1.856182,-0.462404,-0.612423,-0.759847,-0.629157,-0.012881,1.1729,3


<font color = purple> Create the feature set<br>
    that is: train and test splits<br>
    <br>
    
<font color = purple> note: Do not create separate dataframes from predictor variables and target variables<br>
<font color = purple> just provide the indices in train_Test_split method

In [82]:
# import train_test_split
from sklearn.model_selection import train_test_split
# set the train and test variables against *!=6
# with test being only 6 
x_housing_train, x_housing_test, y_housing_train, y_housing_test = train_test_split(housing_df.iloc[:, [0,1,2,3,4,5,7]]
, housing_df.iloc[:, [6]]
, test_size = 0.3, random_state = 123)

In [83]:
x_housing_train.head()
#x_housing_test.head()
#y_housing_train.head()
#y_housing_test.head()


Unnamed: 0,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
12364,-1.79889,1.020937,0.913633,1.399218,0.859123,0.042178,1
12271,-0.607019,0.911382,0.75149,0.660986,0.817273,-0.18169,1
19605,0.26702,-0.774566,-0.807949,-0.707747,-0.854099,-0.998997,1
10600,-1.639974,-0.276757,-0.447895,-0.49758,-0.417293,1.601573,0
45,1.856182,-0.449111,-0.278598,-0.62474,-0.307438,-0.628427,3


<font color = purple> now repeat for titanic dataset<br>
    
remember to use label encoder<br>
    
Also, try using pandas method drop to drop columns

In [84]:
# read titanic data
titanic = pd.read_csv (r'data/titanic.csv')
# make a copy
titanic_df = titanic.copy()
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [85]:
#drop rows where Embarked has missing values
titanic_df.dropna(subset = ['Embarked'], inplace = True)
titanic_df['Embarked'].isna().value_counts()
titanic_df=titanic_df.drop('Cabin', axis=1)
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  889 non-null    int64  
 1   Survived     889 non-null    int64  
 2   Pclass       889 non-null    int64  
 3   Name         889 non-null    object 
 4   Sex          889 non-null    object 
 5   Age          712 non-null    float64
 6   SibSp        889 non-null    int64  
 7   Parch        889 non-null    int64  
 8   Ticket       889 non-null    object 
 9   Fare         889 non-null    float64
 10  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 83.3+ KB


In [86]:
# impute the age using mean
mean_age = titanic['Age'].mean()
# fill the na's with median
titanic_df['Age'].fillna(median_tb,inplace = True)
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  889 non-null    int64  
 1   Survived     889 non-null    int64  
 2   Pclass       889 non-null    int64  
 3   Name         889 non-null    object 
 4   Sex          889 non-null    object 
 5   Age          889 non-null    float64
 6   SibSp        889 non-null    int64  
 7   Parch        889 non-null    int64  
 8   Ticket       889 non-null    object 
 9   Fare         889 non-null    float64
 10  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 83.3+ KB


In [87]:
# use label encoder on the following features: Sex 
lab_enc.fit(titanic_df['Sex'].unique())
list(lab_enc.classes_)

['female', 'male']

In [88]:
titanic_df['Sex'] = lab_enc.transform(titanic_df['Sex'])
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [89]:
# use label encoder on the following features: Sex 
lab_enc.fit(titanic_df['Embarked'].unique())
list(lab_enc.classes_)

['C', 'Q', 'S']

In [95]:
titanic_df['Embarked'] = lab_enc.transform(titanic_df['Embarked'])
titanic.info()

  mask &= (ar1 != a)


ValueError: y contains previously unseen labels: [0, 1, 2]

In [91]:
titanic_df['Embarked'].head()

0    2
1    0
2    2
3    2
4    2
Name: Embarked, dtype: int32

In [92]:
# test train split with 0.3 as test size, 123 as randome_state
x_titanic_train, x_titanic_test, y_titanic_train, y_titanic_test = train_test_split(titanic_df.iloc[:, [2,4,5,6,7,10]],

titanic_df.iloc[:,[1]],

test_size = 0.3, random_state = 123)

##  <font color = blue>Naive Bayes Classifiers

In [93]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB

In [94]:
G_nb = GaussianNB().fit(x_titanic_train, y_titanic_train)

  return f(**kwargs)


In [None]:
# confusion matrix, accuracy score, precision score, recall score, f1_Score

<font color = purple> Now, from the titanic dataset, pick only those predictor variables,<br>
 which can be used to train multinomial Naive Bayes<br>
 use the same train test split, make copies, add a suffix _mnb wherever appropriate<br>
 compare this mnb classifier with G_nb classifier
 

## <font color = blue> Bagging

<font color = purple> First Apply on Decision Tree classifier

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

<font color = purple> base_estimator<br>
The base estimator to fit on random subsets of the dataset. If None, then the base estimator is a decision tree.

In [None]:
tree_restricted = DecisionTreeClassifier(criterion = 'entropy', random_state = 123, max_depth = 4)

In [None]:
bagging = BaggingClassifier(tree_restricted, n_estimators = 100, max_samples = 0.8, random_state = 198)

In [None]:
#bagging.fit(x_titanic_train, y_titanic_train)

In [None]:
# sanity check for our skeleton
bagging.base_estimator_

In [None]:
bagging.n_features_

In [None]:
bagging.estimators_

In [None]:
# calculate score for train model (slight overfit)
bagging.score(x_titanic_train, y_titanic_train)

In [None]:
# calculate score for test model (slight overfit)
bagging.score(x_titanic_test, y_titanic_test)

<font color = purple> extract feature importance<br>
 then plot a bar graph

In [None]:
# calculate the mean for all features used within bagging features.
# score is the importance
import numpy as np
feature_importances = np.mean([tree.feature_importances_ for tree in bagging.estimators_], axis = 0)
feature_importances

In [None]:
import matplotlib as plt
%matplotlib inline

In [None]:
feature_importances_series = pd.Series(feature_importances, index = ['Pclass', 'Sex','Age','Sibsp','Parch', 'Embarked'])
feature_importances_series.plot(kind= 'barh')
# add a title

## <font color = blue> Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf = 

In [None]:
# alot of over fitting
rf.score(x_titanic_train, y_titanic_train)

In [None]:
# alot of over fitting
rf.score(x_titanic_test, y_titanic_test)

<font color = purple> now change the values for following parameters of random forest and create model rf2<br>
 criterion to entropy<br>
 max_features to 4<br>
 n_estimators to 50<br>
 max_depth to 4

##  <font color = blue> Support Vector Machines


In [None]:
# if you need to install
import sys
!{sys.executable} -m pip install nltk
import nltk

In [None]:
#read email data set (its sms data set actually)
sms = pd.read_csv('data/spam.csv', encoding='latin')
sms.info()

In [None]:
sms.head()

In [None]:
# get rid of garbage columns
sms = sms.loc[:'v1','v2',]
sms.head()

In [None]:
# rename appropriately 
# cat and text
sms.columns = [1,2]
sms.columns

In [None]:
sms['cat'].value_counts()

In [None]:
# label encoder
lab_enc.fit(sms['cat'].unique())

In [None]:
sms['cat'] - lab_enc.transform(sms['cat'])

<font color = purple> extract the frequency/count of each word<br>
   use CountVectorizer class

<font color = purple> Apply SVM<br>
 first linear kernel<br>
 then polynomial kernel<br> 

<font color = purple> Now try grid search to hyper tune polynomial svm's parameter (hyperparameters)<br>
 possible parameteres include <br>
 degree<br>
 gamma<br>
 C(regularisation parameter)<br>
 kernel<br>

<font color = purple> you are ready to compare NB and SVM