In [22]:
## import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
%matplotlib inline
import re
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [23]:
## Getting train data

titanic_test = pd.read_csv('test_6.csv')
titanic_test

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [24]:
titanic_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.1+ KB


In [25]:
## Data Cleaning
titanic_test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [26]:
## To identify rows in the DataFrame where at least one value is missing (NaN/null) across any column
titanic_test[titanic_test.isnull().any(axis = 1)]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
412,1304,3,"Henriksson, Miss. Jenny Lovisa",female,28.0,0,0,347086,7.7750,,S
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [27]:
# Identify rows with missing values in the "Age" column
missing_rows = titanic_test[titanic_test['Fare'].isnull()]
missing_rows

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
152,1044,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,,S


In [28]:
## to correct missing values REMOVING NULL VALUES
# Drop rows with missing values in Cabin columns
titanic_test = titanic_test.dropna(subset=['Cabin'])

In [29]:
## Finding mean/median/mode in AGE
# Assuming 'Age' column has some missing values

# Median
median_age = titanic_test['Age'].median()

# Mode
mode_age = titanic_test['Age'].mode()

# Mean (Note: Be cautious with mean if there are outliers in the data)
mean_age = titanic_test['Age'].mean()

# If there are multiple modes, convert to a list
mode_age_list = mode_age.tolist()

print("Median Age:", median_age)
print("Mode Age:", mode_age_list)
print("Mean Age:", mean_age)


Median Age: 39.0
Mode Age: [30.0, 36.0, 45.0, 48.0, 55.0]
Mean Age: 39.247126436781606


In [30]:
# Replace NaN values in 'Age' column with 36
titanic_test.loc[pd.isnull(titanic_test['Age']), 'Age'] = 39

In [31]:
## Data Cleaning
titanic_test.isnull().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [32]:
## How many types of diffenent valies in each column
unique_values = titanic_test.nunique()
print(unique_values)

PassengerId    91
Pclass          3
Name           91
Sex             2
Age            46
SibSp           4
Parch           5
Ticket         70
Fare           59
Cabin          76
Embarked        3
dtype: int64


In [33]:
## Checking for the actual values in the columns
unique_values = titanic_test.apply(lambda x: x.unique())
print(unique_values)


PassengerId    [904, 906, 916, 918, 920, 926, 933, 936, 938, ...
Pclass                                                 [1, 3, 2]
Name           [Snyder, Mrs. John Pillsbury (Nelle Stevenson)...
Sex                                               [female, male]
Age            [23.0, 47.0, 48.0, 22.0, 41.0, 30.0, 39.0, 45....
SibSp                                               [1, 0, 3, 2]
Parch                                            [0, 3, 1, 2, 4]
Ticket         [21228, W.E.P. 5734, PC 17608, 113509, 113054,...
Fare           [82.2667, 61.175, 262.375, 61.9792, 30.5, 57.7...
Cabin          [B45, E31, B57 B59 B63 B66, B36, A21, C78, D34...
Embarked                                               [S, C, Q]
dtype: object


In [34]:
### Create new feature - FAMILY SIZE

# Define a function to create the 'Family' feature
def create_family_feature(row):
    # Sum the SibSp and Parch columns to get the total family size
    family_size = row['SibSp'] + row['Parch']
    return family_size

# Apply the function to create the 'Family' feature
titanic_test['Family'] = titanic_test.apply(create_family_feature, axis=1)

# Display the DataFrame with the new 'Family' feature
titanic_test.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  titanic_test['Family'] = titanic_test.apply(create_family_feature, axis=1)


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Family
12,904,1,"Snyder, Mrs. John Pillsbury (Nelle Stevenson)",female,23.0,1,0,21228,82.2667,B45,S,1
14,906,1,"Chaffee, Mrs. Herbert Fuller (Carrie Constance...",female,47.0,1,0,W.E.P. 5734,61.175,E31,S,1
24,916,1,"Ryerson, Mrs. Arthur Larned (Emily Maria Borie)",female,48.0,1,3,PC 17608,262.375,B57 B59 B63 B66,C,4
26,918,1,"Ostby, Miss. Helene Ragnhild",female,22.0,0,1,113509,61.9792,B36,C,1
28,920,1,"Brady, Mr. John Bertram",male,41.0,0,0,113054,30.5,A21,S,0


In [35]:
# Remove the 'PassengerId','Name', 'SibSp', 'Parch' and 'Ticket' columns
titanic_test.drop(['Name', 'SibSp', 'Parch','Ticket'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  titanic_test.drop(['Name', 'SibSp', 'Parch','Ticket'], axis=1, inplace=True)


In [36]:
##### ENCODING!!!!!
# Encode categorical variables. Mapping 'Sex' and 'Embarked' into numerical
titanic_test['Sex'] = titanic_test['Sex'].map({'male': 0, 'female': 1})
titanic_test['Embarked'] = titanic_test['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  titanic_test['Sex'] = titanic_test['Sex'].map({'male': 0, 'female': 1})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  titanic_test['Embarked'] = titanic_test['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})


In [37]:
# Feature Engineering - Cabin
##titanic_train['Deck'] = titanic_train['Cabin'].str[:1]  # Extract first letter of Cabin to get the deck

## use .astype(str) to convert the 'Cabin' column to strings, ensuring that all values are treated as strings.
# Convert 'Cabin' column to strings and extract the first character
# Then, we use .str[0] to extract the first character from each string in the 'Cabin' column.
titanic_test['Cabin'] = titanic_test['Cabin'].astype(str).str[0]

## we map the extracted first characters to numerical values using the cabin_mapping dictionary.
# Map the first character of 'Cabin' to numerical values
cabin_mapping = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'T': 7}
titanic_test['Cabin'] = titanic_test['Cabin'].map(cabin_mapping)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  titanic_test['Cabin'] = titanic_test['Cabin'].astype(str).str[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  titanic_test['Cabin'] = titanic_test['Cabin'].map(cabin_mapping)


In [38]:
## Check for dtypes
titanic_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 91 entries, 12 to 414
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  91 non-null     int64  
 1   Pclass       91 non-null     int64  
 2   Sex          91 non-null     int64  
 3   Age          91 non-null     float64
 4   Fare         91 non-null     float64
 5   Cabin        91 non-null     int64  
 6   Embarked     91 non-null     int64  
 7   Family       91 non-null     int64  
dtypes: float64(2), int64(6)
memory usage: 6.4 KB


In [39]:
titanic_test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Cabin,Embarked,Family
12,904,1,1,23.0,82.2667,1,0,1
14,906,1,1,47.0,61.175,4,0,1
24,916,1,1,48.0,262.375,1,1,4
26,918,1,1,22.0,61.9792,1,1,1
28,920,1,0,41.0,30.5,0,0,0


In [41]:
# Scale numerical features (Age, Fare)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
titanic_test[['Pclass', 'Sex', 'Age', 'Fare', 'Cabin', 'Family','Embarked']] = scaler.fit_transform(titanic_test[['Pclass', 'Sex', 'Age', 'Fare', 'Cabin', 'Family','Embarked']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  titanic_test[['Pclass', 'Sex', 'Age', 'Fare', 'Cabin', 'Family','Embarked']] = scaler.fit_transform(titanic_test[['Pclass', 'Sex', 'Age', 'Fare', 'Cabin', 'Family','Embarked']])


In [42]:
# Feature Selection
selected_features = ['Pclass', 'Sex', 'Age', 'Fare', 'Cabin', 'Family','Embarked']

# Extract selected features from the test dataset
X_test = titanic_test[selected_features]

In [43]:
model_1= model.predict(X_test)
model_1

NameError: name 'model' is not defined

In [40]:
## LOGISITIC REGRESSION

# Scale numerical features (Age, Fare)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
##titanic_test[['Age', 'Fare']] = scaler.fit_transform(test_df[['Age', 'Fare']])

# 2. Feature Selection
selected_features = ['Pclass', 'Sex', 'Age', 'Fare', 'Cabin', 'Family']

# Extract selected features from the test dataset
X_test = titanic_test[selected_features]

# 3. Model Evaluation
# Load the trained Logistic Regression model
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

# Train the model on the train dataset (assuming train dataset is stored in 'train_df')
##logreg.fit(titanic_test[selected_features], titanic_test['Survived'])

# Make predictions on the test dataset
y_pred = logreg.predict(X_test)

# 4. Display model evaluation metrics
print("Accuracy:", accuracy_score(true_labels, y_pred))
print("Precision:", precision_score(true_labels, y_pred))
print("Recall:", recall_score(true_labels, y_pred))
print("F1 Score:", f1_score(true_labels, y_pred))


NotFittedError: This LogisticRegression instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.