In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from seaborn import load_dataset
from sklearn.model_selection import train_test_split
from sklearn import datasets, linear_model, metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

#this method will help us to #download the Titanic dataset
#%matplotlib inline #if you use jupyter notebook
plt.style.use('ggplot') #check for more with plt.style.available
sns.set_theme(color_codes=True)

In [2]:
df = load_dataset("titanic")

# Data Manipulation

In [3]:
df.isna().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [4]:
df.dropna(subset=['embarked','embark_town'],inplace=True)
df.drop(['embarked','deck','alive'],axis = 1,inplace = True) #embarked column is redundant column
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,class,who,adult_male,embark_town,alone
0,0,3,male,22.0,1,0,7.25,Third,man,True,Southampton,False
1,1,1,female,38.0,1,0,71.2833,First,woman,False,Cherbourg,False
2,1,3,female,26.0,0,0,7.925,Third,woman,False,Southampton,True
3,1,1,female,35.0,1,0,53.1,First,woman,False,Southampton,False
4,0,3,male,35.0,0,0,8.05,Third,man,True,Southampton,True


In [5]:
df.isna().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
class            0
who              0
adult_male       0
embark_town      0
alone            0
dtype: int64

In [6]:
df.shape

(889, 12)

## Find age values projected by Regression

In [7]:
# Get one hot encoding of Cat.columns
df_encode = pd.get_dummies(df,columns =['sex','class','who','embark_town','adult_male','alone'],dtype = float)
df_encode.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,sex_female,sex_male,class_First,class_Second,...,who_child,who_man,who_woman,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton,adult_male_False,adult_male_True,alone_False,alone_True
0,0,3,22.0,1,0,7.25,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
1,1,1,38.0,1,0,71.2833,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
2,1,3,26.0,0,0,7.925,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
3,1,1,35.0,1,0,53.1,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
4,0,3,35.0,0,0,8.05,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0


In [8]:
#Move age column to the end
age = df_encode['age']
df_encode.drop(labels=['age'], axis=1,inplace = True)
df_encode.insert(20, 'age', age)
df_encode.head()

Unnamed: 0,survived,pclass,sibsp,parch,fare,sex_female,sex_male,class_First,class_Second,class_Third,...,who_man,who_woman,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton,adult_male_False,adult_male_True,alone_False,alone_True,age
0,0,3,1,0,7.25,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,22.0
1,1,1,1,0,71.2833,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,38.0
2,1,3,0,0,7.925,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,26.0
3,1,1,1,0,53.1,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,35.0
4,0,3,0,0,8.05,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,35.0


In [9]:
#get dataframe without NaN value in 'age'
df_train = df_encode.dropna(subset=['age'])
df_train.shape

(712, 21)

In [10]:
#get dataframe with NaN value in 'age'
df_test = df_encode[df_encode['age'].isna()]
df_test.shape

(177, 21)

In [11]:
#Training df
X_train = df_train.iloc[:, :-1]
y_train = df_train[['age']]

In [12]:
#Test df
X_test = df_test.iloc[:, :-1]
y_test = df_test[['age']]

In [13]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train) ## model train

In [14]:
y_pred = regressor.predict(X_test)

## Predicted age value back to the Dataset

In [15]:
df_test.drop(labels=['age'], axis=1,inplace = True)
df_test.insert(20, 'age', y_pred)
df_test.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test.drop(labels=['age'], axis=1,inplace = True)


Unnamed: 0,survived,pclass,sibsp,parch,fare,sex_female,sex_male,class_First,class_Second,class_Third,...,who_man,who_woman,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton,adult_male_False,adult_male_True,alone_False,alone_True,age
5,0,3,0,0,8.4583,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,32.820806
17,1,2,0,0,13.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,31.763657
19,1,3,0,0,7.225,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,25.446943
26,0,3,0,0,7.225,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,29.707114
28,1,3,0,0,7.8792,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,28.577995


In [16]:
df_test.head()

Unnamed: 0,survived,pclass,sibsp,parch,fare,sex_female,sex_male,class_First,class_Second,class_Third,...,who_man,who_woman,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton,adult_male_False,adult_male_True,alone_False,alone_True,age
5,0,3,0,0,8.4583,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,32.820806
17,1,2,0,0,13.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,31.763657
19,1,3,0,0,7.225,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,25.446943
26,0,3,0,0,7.225,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,29.707114
28,1,3,0,0,7.8792,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,28.577995


In [17]:
frames = [df_train,df_test]
cl_df = pd.concat(frames)
cl_df.shape

(889, 21)

In [18]:
X = cl_df.iloc[:,1:]
y = cl_df['survived']

In [19]:
y.shape

(889,)

In [1]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
tree_clf = DecisionTreeClassifier(max_depth=3)
tree_clf.fit(X_train, y_train)

NameError: name 'train_test_split' is not defined

In [26]:
y_pred = tree_clf.predict(X_test)
y_pred

array([0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 1], dtype=int64)

In [27]:
accuracy_score(y_pred, y_test)

0.8651685393258427

In [29]:
print(confusion_matrix(y_pred, y_test))

[[101   9]
 [ 15  53]]


In [30]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.87      0.92      0.89       110
           1       0.85      0.78      0.82        68

    accuracy                           0.87       178
   macro avg       0.86      0.85      0.85       178
weighted avg       0.86      0.87      0.86       178

