In [49]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

In [50]:
train_data = pd.read_csv('C:/Users/Msi/PycharmProjects/KaggleProjects/Titanic/data/train.csv')
test_data = pd.read_csv('C:/Users/Msi/PycharmProjects/KaggleProjects/Titanic/data/test.csv')

In [51]:
train_data.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [52]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [53]:
# Preprocessing data
X = train_data.drop(['PassengerId', 'Survived', 'Name','Ticket', 'Cabin'], axis=1)
y = train_data['Survived']
X = X.copy()
X = pd.get_dummies(X, columns=['Sex', 'Embarked'], drop_first=True) # to make binary columns into numerical
# handle missing values
X = X.assign(Age=X['Age'].fillna(X['Age'].mean()),
             Fare=X['Fare'].fillna(X['Fare'].mean()))


X_test = test_data.drop(['PassengerId', 'Name','Ticket', 'Cabin'], axis=1)
X_test = X_test.copy()
X_test = pd.get_dummies(X_test, columns=['Sex', 'Embarked'], drop_first=True)
X_test = X_test.assign(Age=X_test['Age'].fillna(X_test['Age'].mean()),
             Fare=X_test['Fare'].fillna(X_test['Fare'].mean()))



X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [54]:
models = [
    DecisionTreeClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
    LogisticRegression(random_state=42)
]

for model in models:
    model.fit(X_train, y_train)
    predictions = model.predict(X_val)
    accuracy = accuracy_score(y_val, predictions)
    print(f"{type(model).__name__} Accuracy: {accuracy}")

DecisionTreeClassifier Accuracy: 0.7932960893854749
RandomForestClassifier Accuracy: 0.8044692737430168
GradientBoostingClassifier Accuracy: 0.8156424581005587
LogisticRegression Accuracy: 0.7988826815642458


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [44]:
tions = model.predict(X_val)
accuracy = accuracy_score(y_val, predictions)
print("Accuracy: ", accuracy)

Accuracy:  0.7932960893854749


In [45]:
for col in X_train.columns:
    if col not in X_test.columns:
        print("Column", col, "is not in X_test")
        X_test[col] = 0

In [46]:
test_predictions = model.predict(X_test)
submission_file = pd.DataFrame({'PassengerId': test_data['PassengerId'], 'Survived': test_predictions})
submission_file.to_csv('C:/Users/Msi/PycharmProjects/KaggleProjects/Titanic/data/submission.csv', index=False)