In [77]:
# Import tools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score #Added for accuracy score at the bottom 

In [40]:
# Load the passenger data
passengers = pd.read_csv('passengers.csv')
passengers.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [41]:
# Fill the nan values in the age column  #DOBBIE
passengers['Age'].fillna(value = round(passengers['Age'].mean()), inplace = True)
passengers.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [42]:
# Create a first class column
passengers['FirstClass'] = passengers.Pclass.apply( lambda p: 1 if p == 1 else 0)
passengers.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FirstClass
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0


In [43]:
passengers['SecondClass'] = passengers.Pclass.apply( lambda p: 1 if p == 2 else 0)
passengers['Sex_binary'] = passengers.Sex.map({"male": 0, "female": 1})
passengers.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FirstClass,SecondClass,Sex_binary
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,0,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1,0,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,0,0


In [44]:
#It selects the relevant features for the prediction and the target variable.
features = passengers[['Age', 'FirstClass', 'SecondClass', 'Sex_binary']]
target = passengers['Survived']

In [80]:
#splits the data into training and testing sets to evaluate the model's performance on unseen data
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [81]:
#standardizes the features to have a mean of 0 and a standard deviation of 1,
#which is particularly important for logistic regression  to ensure all features contribute equally to 
#the prediction
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
Y_train_scaled = scaler.fit_transform(X_train)
Y_test_scaled = scaler.transform(X_test)

In [82]:
# Creating the logistic regression model
model = LogisticRegression()

In [83]:
# Training the model with the training data
model.fit(X_train_scaled, y_train)

In [84]:
# You can now use model.predict(X_test_scaled) to make predictions on the test set
# And evaluate the model's performance
predictions = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, predictions)

In [85]:
print(f'Model Accuracy: {accuracy:.2f}')

Model Accuracy: 0.80


In [86]:
# 80% is the highest I have been able to get this model. It sat at 10% and 50% for forever

# Scalers 

#### Robust Scaler 

In [87]:
# Robust Scaler 
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
import numpy as np

scaler = RobustScaler()

# Without this I get an error saying that "NameError: name 'Y_train' is not defined"
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert y_train and y_test to NumPy arrays before reshaping
y_train_scaled = scaler.fit_transform(np.array(y_train).reshape(-1, 1))  
y_test_scaled = scaler.transform(np.array(y_test).reshape(-1, 1))  

In [88]:
# Creating the logistic regression model
model = LogisticRegression()

In [89]:
# Training the model with the training data
model.fit(X_train_scaled, y_train)

In [90]:
# You can now use model.predict(X_test_scaled) to make predictions on the test set
# And evaluate the model's performance
predictions = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, predictions)

In [91]:
print(f'Model Accuracy: {accuracy:.2f}')

Model Accuracy: 0.80


#### MinMax Scaler

In [92]:
#MinMax Scaler
from sklearn.preprocessing import MinMaxScaler

# Without this I get an error saying that "NameError: name 'Y_train' is not defined"
X_train, X_test, Y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

scaler = MinMaxScaler()

# Without this I get an error saying that "NameError: name 'Y_train' is not defined"
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert y_train and y_test to NumPy arrays before reshaping
y_train_scaled = scaler.fit_transform(np.array(y_train).reshape(-1, 1))  
y_test_scaled = scaler.transform(np.array(y_test).reshape(-1, 1)) 

In [93]:
# Creating the logistic regression model
model = LogisticRegression()

In [94]:
# Training the model with the training data
model.fit(X_train_scaled, y_train)

In [95]:
# You can now use model.predict(X_test_scaled) to make predictions on the test set
# And evaluate the model's performance
predictions = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, predictions)

print(f'Model Accuracy: {accuracy:.2f}')  #It went down from 80% to 79%

Model Accuracy: 0.79


#### QuantileTransformer

In [96]:
#QuantileTransformer
from sklearn.preprocessing import QuantileTransformer

scaler = QuantileTransformer()

# Instantiate the QuantileTransformer scaler with a specified number of quantiles
# You can adjust the number of quantiles as needed
scaler = QuantileTransformer(n_quantiles=100)

X_train, X_test, Y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert y_train and y_test to NumPy arrays before reshaping
y_train_scaled = scaler.fit_transform(np.array(y_train).reshape(-1, 1))  
y_test_scaled = scaler.transform(np.array(y_test).reshape(-1, 1)) 

In [97]:
# Creating the logistic regression model
model = LogisticRegression()

# Training the model with the training data
model.fit(X_train_scaled, y_train)

In [98]:
# You can now use model.predict(X_test_scaled) to make predictions on the test set
# And evaluate the model's performance
predictions = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, predictions)

print(f'Model Accuracy: {accuracy:.2f}') #It keep going down? 79--->78

Model Accuracy: 0.78


# Different Regressions 

#### Lasso Regression 

In [109]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error  

X_train, X_test, Y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert y_train and y_test to NumPy arrays before reshaping
y_train_scaled = scaler.fit_transform(np.array(y_train).reshape(-1, 1))  
y_test_scaled = scaler.transform(np.array(y_test).reshape(-1, 1)) 

# Instantiate the Lasso Regression model
lasso_model = Lasso()

# Fit the model to the training data
lasso_model.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = lasso_model.predict(X_test)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print(f'Model Accuracy: {accuracy:.2f}')


Model Accuracy: 0.78


#### Decision Tree 

In [103]:
from sklearn.tree import DecisionTreeRegressor

# Assuming 'X' is your feature matrix and 'y' is your target variable

# Instantiate the Decision Tree Regression model
tree_model = DecisionTreeRegressor()

# Fit the model to the training data
tree_model.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = tree_model.predict(X_test)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print(f'Model Accuracy: {accuracy:.2f}')


Model Accuracy: 0.78


#### Random Forest

In [104]:
from sklearn.ensemble import RandomForestRegressor

# Assuming 'X' is your feature matrix and 'y' is your target variable

# Instantiate the Random Forest Regression model
forest_model = RandomForestRegressor()

# Fit the model to the training data
forest_model.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = forest_model.predict(X_test)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print(f'Model Accuracy: {accuracy:.2f}')


Model Accuracy: 0.78
