# Sklearn Models and Libraries


In [5]:

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

In [6]:
import pandas as pd
import re
import numpy as np

In [7]:
!pip install -q kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!ls ~/.kaggle
!kaggle competitions download -c titanic

kaggle.json
Downloading test.csv to /content
  0% 0.00/28.0k [00:00<?, ?B/s]
100% 28.0k/28.0k [00:00<00:00, 13.0MB/s]
Downloading train.csv to /content
  0% 0.00/59.8k [00:00<?, ?B/s]
100% 59.8k/59.8k [00:00<00:00, 49.8MB/s]
Downloading gender_submission.csv to /content
  0% 0.00/3.18k [00:00<?, ?B/s]
100% 3.18k/3.18k [00:00<00:00, 2.66MB/s]


Read the dataset


In [8]:
train_df = pd.read_csv('/content/train.csv')
test_df = pd.read_csv('/content/test.csv')
all_df = pd.concat([train_df,test_df])
combine = [train_df,test_df]

In [9]:
for dataset in combine:
  dataset['FamilySize'] = dataset['Parch'] + dataset['SibSp']

In [10]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0


In [11]:
# we see that there is about 177 rows of ages are missing , and the majority of the Cabin are not there .

In [12]:
#we should actualy focus first on the discrete value , like 


In [13]:
train_df[['Survived','Pclass']].groupby('Pclass',as_index = False).mean().sort_values(by = 'Survived',ascending = False)

Unnamed: 0,Pclass,Survived
0,1,0.62963
1,2,0.472826
2,3,0.242363


In [14]:
train_df[['Survived','Sex']].groupby('Sex',as_index = False).mean().sort_values(by = 'Survived',ascending = False)

Unnamed: 0,Sex,Survived
0,female,0.742038
1,male,0.188908


In [15]:
train_df[['Survived','SibSp']].groupby('SibSp',as_index = True).mean().sort_values(by = 'Survived',ascending = False)

Unnamed: 0_level_0,Survived
SibSp,Unnamed: 1_level_1
1,0.535885
2,0.464286
0,0.345395
3,0.25
4,0.166667
5,0.0
8,0.0


In [16]:
train_df[['Survived','Parch']].groupby('Parch',as_index =True).mean().sort_values(by = 'Survived',ascending = False)

Unnamed: 0_level_0,Survived
Parch,Unnamed: 1_level_1
3,0.6
1,0.550847
2,0.5
0,0.343658
5,0.2
4,0.0
6,0.0


In [17]:
train_df[['Survived','FamilySize']].groupby('FamilySize',as_index = True).mean().sort_values(by = 'Survived',ascending = False)

Unnamed: 0_level_0,Survived
FamilySize,Unnamed: 1_level_1
3,0.724138
2,0.578431
1,0.552795
6,0.333333
0,0.303538
4,0.2
5,0.136364
7,0.0
10,0.0


In [18]:
for dataset in combine:
  dataset['haveSip'] = dataset['SibSp'].apply(lambda x : min(x,1))
  dataset['haveParch'] = dataset['Parch'].apply(lambda x : min(x,1))
  dataset['Alone'] =  dataset['haveParch'] | dataset['haveSip']

In [19]:
train_df[['Survived','haveParch']].groupby('haveParch',as_index = True).mean().sort_values(by = 'Survived',ascending = False)

Unnamed: 0_level_0,Survived
haveParch,Unnamed: 1_level_1
1,0.511737
0,0.343658


In [20]:
train_df[['Survived','haveSip']].groupby('haveSip',as_index = True).mean().sort_values(by = 'Survived',ascending = False)

Unnamed: 0_level_0,Survived
haveSip,Unnamed: 1_level_1
1,0.466431
0,0.345395


In [21]:
train_df[['Survived','Alone']].groupby('Alone',as_index = True).mean().sort_values(by = 'Survived',ascending = False)

Unnamed: 0_level_0,Survived
Alone,Unnamed: 1_level_1
1,0.50565
0,0.303538


In [22]:
def extract_title(name):
  return re.findall(r'[a-zA-Z]+\.',name)[0][0:-1]


In [23]:
for dataset in combine:
  dataset['Title'] = dataset['Name'].apply(extract_title)

In [24]:
for dataset in combine:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col',\
 	'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona','Ms','Mme','Mlle'], 'Rare')

In [25]:
for dataset in combine:
  mean_age = dataset['Age'].mean()
  std_age = dataset['Age'].std()
  null_sum = dataset['Age'].isnull().sum()
  random_values = np.random.randint(mean_age - std_age,mean_age + std_age,null_sum)
  age_values = dataset['Age'].copy()
  age_values[np.isnan(age_values)] = random_values
  dataset['Age'] = age_values.copy()

In [26]:
for dataset in combine:
  dataset['CutAge'] = dataset['Age']//(dataset['Age'].max()//5)

In [27]:
dataset['CutAge'].unique()

array([2., 3., 4., 1., 0., 5.])

In [28]:
train_df[['Survived','CutAge']].groupby('CutAge',as_index = True).mean().sort_values(by = 'Survived',ascending = False)

Unnamed: 0_level_0,Survived
CutAge,Unnamed: 1_level_1
5.0,1.0
0.0,0.576087
3.0,0.473684
2.0,0.377863
1.0,0.341518
4.0,0.0


In [29]:
for dataset in combine:
  dataset = dataset.drop(['PassengerId','Name','Cabin'],axis = 1)

In [30]:


train_df = train_df.drop(['PassengerId','Name','Cabin'],axis = 1)
test_df = test_df.drop(['PassengerId','Name','Cabin'],axis = 1)


In [31]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Survived    891 non-null    int64  
 1   Pclass      891 non-null    int64  
 2   Sex         891 non-null    object 
 3   Age         891 non-null    float64
 4   SibSp       891 non-null    int64  
 5   Parch       891 non-null    int64  
 6   Ticket      891 non-null    object 
 7   Fare        891 non-null    float64
 8   Embarked    889 non-null    object 
 9   FamilySize  891 non-null    int64  
 10  haveSip     891 non-null    int64  
 11  haveParch   891 non-null    int64  
 12  Alone       891 non-null    int64  
 13  Title       891 non-null    object 
 14  CutAge      891 non-null    float64
dtypes: float64(3), int64(8), object(4)
memory usage: 104.5+ KB


In [32]:
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].mean())

In [33]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Pclass      418 non-null    int64  
 1   Sex         418 non-null    object 
 2   Age         418 non-null    float64
 3   SibSp       418 non-null    int64  
 4   Parch       418 non-null    int64  
 5   Ticket      418 non-null    object 
 6   Fare        418 non-null    float64
 7   Embarked    418 non-null    object 
 8   FamilySize  418 non-null    int64  
 9   haveSip     418 non-null    int64  
 10  haveParch   418 non-null    int64  
 11  Alone       418 non-null    int64  
 12  Title       418 non-null    object 
 13  CutAge      418 non-null    float64
dtypes: float64(3), int64(7), object(4)
memory usage: 45.8+ KB


In [34]:
from sklearn.ensemble import RandomForestClassifier
y = train_df["Survived"]
features = ["Pclass", 'Age',"Sex","FamilySize","Embarked","SibSp","Fare"]
X = pd.get_dummies(train_df[features])
X_test = pd.get_dummies(test_df[features])
model = RandomForestClassifier(n_estimators=100,max_depth = 6)
model.fit(X, y)
predictions = model.predict(X_test)
print(model.score(X,y))
submission = pd.read_csv('/content/gender_submission.csv')
submission['Survived'] = predictions
submission.to_csv('husamalsayed.csv', index=False)

0.8619528619528619
