In [1]:
# DATA IMPORT

import seaborn as sns
import pandas as pd

titanic_data = sns.load_dataset('titanic')
titanic_data.head()



Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [2]:
# DATA INVESTIGATION

titanic_data.isnull().sum()  # how many empty(Null) values in table


Unnamed: 0,0
survived,0
pclass,0
sex,0
age,177
sibsp,0
parch,0
fare,0
embarked,2
class,0
who,0


In [3]:
titanic_data.dtypes # what types data have

Unnamed: 0,0
survived,int64
pclass,int64
sex,object
age,float64
sibsp,int64
parch,int64
fare,float64
embarked,object
class,category
who,object


In [5]:
titanic_data.drop('deck', axis=1, inplace=True) #delete because too much null values
titanic_data.drop(['alive', 'alone', 'adult_male'], axis=1, inplace=True) # dont need those
titanic_data.drop(['class'], axis=1, inplace=True) # cant remove twice, get error


In [6]:
# DATA TRANSFORMATION

titanic_data['class'] = titanic_data['class'].astype(str) # modified type from category to string

titanic_data.fillna(0, inplace=True) # fill null values with 0

KeyError: 'class'

In [7]:
titanic_data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,who,embark_town
0,0,3,male,22.0,1,0,7.25,S,man,Southampton
1,1,1,female,38.0,1,0,71.2833,C,woman,Cherbourg
2,1,3,female,26.0,0,0,7.925,S,woman,Southampton
3,1,1,female,35.0,1,0,53.1,S,woman,Southampton
4,0,3,male,35.0,0,0,8.05,S,man,Southampton


In [8]:
titanic_data['sex'].unique() # get unique values

array(['male', 'female'], dtype=object)

In [9]:
titanic_data['who'].unique() # get unique values

array(['man', 'woman', 'child'], dtype=object)

In [10]:
# SEX column

sex_mapping = {'male': 0, 'female': 1} # specify new mapping
titanic_data['sex'] = titanic_data['sex'].map(sex_mapping) # replace values in existing df

titanic_data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,who,embark_town
0,0,3,0,22.0,1,0,7.25,S,man,Southampton
1,1,1,1,38.0,1,0,71.2833,C,woman,Cherbourg
2,1,3,1,26.0,0,0,7.925,S,woman,Southampton
3,1,1,1,35.0,1,0,53.1,S,woman,Southampton
4,0,3,0,35.0,0,0,8.05,S,man,Southampton


In [11]:
# OTHER columns to nums

# embark_town

titanic_data['embark_town'].unique() # check if we have 0 values inside

array(['Southampton', 'Cherbourg', 'Queenstown', 0], dtype=object)

In [12]:
embark_town_mapping = {'Southampton': 1, 'Queenstown': 3, 'Cherbourg': 2, 0:0} # specify new mapping , leave 0 as 0
titanic_data['embark_town'] = titanic_data['embark_town'].map(embark_town_mapping) # replace values in existing df

titanic_data.head()


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,who,embark_town
0,0,3,0,22.0,1,0,7.25,S,man,1
1,1,1,1,38.0,1,0,71.2833,C,woman,2
2,1,3,1,26.0,0,0,7.925,S,woman,1
3,1,1,1,35.0,1,0,53.1,S,woman,1
4,0,3,0,35.0,0,0,8.05,S,man,1


In [13]:
# same 'embarked' and 'who'

titanic_data['embarked'].unique() # have 0
titanic_data['who'].unique() # dont have 0

array(['man', 'woman', 'child'], dtype=object)

In [14]:
embarked_mapping = {'S': 1, 'Q': 3, 'C': 2, 0:0} # specify new mapping , leave 0 as 0
titanic_data['embarked'] = titanic_data['embarked'].map(embarked_mapping) # replace values in existing df

who_mapping = {'man': 0, 'woman': 1, 'child': 2} # specify new mapping
titanic_data['who'] = titanic_data['who'].map(who_mapping) # replace values in existing df

titanic_data.head()
# can be done in one go w LabelEncoder sklearn

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,who,embark_town
0,0,3,0,22.0,1,0,7.25,1,0,1
1,1,1,1,38.0,1,0,71.2833,2,1,2
2,1,3,1,26.0,0,0,7.925,1,1,1
3,1,1,1,35.0,1,0,53.1,1,1,1
4,0,3,0,35.0,0,0,8.05,1,0,1


In [15]:
titanic_data.info() # check for types again

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   survived     891 non-null    int64  
 1   pclass       891 non-null    int64  
 2   sex          891 non-null    int64  
 3   age          891 non-null    float64
 4   sibsp        891 non-null    int64  
 5   parch        891 non-null    int64  
 6   fare         891 non-null    float64
 7   embarked     891 non-null    int64  
 8   who          891 non-null    int64  
 9   embark_town  891 non-null    int64  
dtypes: float64(2), int64(8)
memory usage: 69.7 KB


In [16]:
# BUILDING A DATA MODEL

from sklearn.model_selection import train_test_split

X = titanic_data.drop('survived', axis=1) # FEATURES ,keep whole set w out survived
y = titanic_data['survived'] # DEPENDENT VARIABLE keeping only survived



In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7, random_state=42) #random needed for shuffling, it is not a must


In [18]:
# LOGISTIC REGRESSION
#import-initialize-train

from sklearn.linear_model import LogisticRegression # import

model = LogisticRegression() # initialize
model.fit(X_train, y_train)  # train



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [19]:
prediction_results = model.predict(X_test) # predict # 0 - survived, 1- not survived ?
prediction_results

array([0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,

In [20]:

#EASY: add DecisionTreeClassifier to titanic data predictions.

from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier()
model.fit(X_train, y_train)
prediction_results = model.predict(X_test)
prediction_results


array([0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1,
       1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0,
       1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1,

In [22]:
#HARD: Investigate what is cross-validation and implement cross-validation on any classification model you prefer on Titanic data.


from sklearn.neighbors import KNeighborsClassifier # class used to create and train a K-Nearest Neighbors model for classification tasks
from sklearn.model_selection import KFold, cross_val_score # 'KFold' for splitting data; 'cross_val_score' to evaluate model's performance

# Initializing the model
model_knn_cv = KNeighborsClassifier()

# Defining KFold cross-validator
k_fold = KFold(n_splits=10, shuffle=True, random_state=42)

# Evaluating the model using cross-validation
accuracy_scores = cross_val_score(model_knn_cv, X, y, cv=k_fold, scoring='accuracy')

# Output of the accuracy scores for each fold
print("Accuracy scores for each fold:", accuracy_scores)

# Output of the mean accuracy across all folds
print("Mean accuracy:", accuracy_scores.mean())


Accuracy scores for each fold: [0.73333333 0.68539326 0.6741573  0.66292135 0.61797753 0.65168539
 0.68539326 0.6741573  0.65168539 0.75280899]
Mean accuracy: 0.6789513108614231
