In [1]:
import pandas as pd
from IPython.display import display
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [2]:
# Load the raw (deliberately corrupted) Titanic dataset from the working directory.
file = "corrupted_titanic_data.csv"
df = pd.read_csv(file)

# Lift pandas' display limits (None = unlimited) so frames are shown without
# row/column truncation and cell text is not cut off.
for _display_option in (
    'display.max_rows',
    'display.max_columns',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(_display_option, None)

In [None]:
# Keep only rows with no missing values in any column, then renumber the rows.
# reset_index(drop=False) keeps the old row labels as a new "index" column.
df_full = df.dropna(axis=0, how="any").reset_index(drop=False)

In [4]:
# Drop columns that are redundant or derived from other columns and whose
# usefulness is questionable, plus the "index" column left over from reset_index().
# Some derived columns (alone, adult_male) are kept on purpose because they are
# useful data points to visualize. fare and pclass are most likely correlated,
# but both are kept.
redundant_columns = [
    'index',
    'pclass_redundant',
    'class',
    'alive',
    'embark_town',
    'who',
    'fare_age_combination',
]
df_full_necc = df_full.drop(columns=redundant_columns)


In [5]:
# --- age -------------------------------------------------------------------
# Some ages are stored as strings; coerce the column to numbers so that anything
# unparseable becomes NaN instead of breaking the numeric comparisons below.
age = pd.to_numeric(df_full_necc["age"], errors="coerce")

# Treat implausible ages (100 or older, 0 or younger) as corrupted and blank them.
age = age.mask((age >= 100) | (age <= 0))

# Impute blanked/unparseable ages with the median of the remaining valid ages so
# those rows stay usable. (The variable is named "avg" but holds the median.)
avg_age = age.median()
# Assign the filled Series back to the frame. Calling fillna(inplace=True) on
# df[...] is chained assignment: it raises a FutureWarning in pandas 2.x and does
# not update the frame at all under Copy-on-Write / pandas 3.
df_full_necc["age"] = age.fillna(avg_age)


# --- fare ------------------------------------------------------------------
# Same approach as age: fares of 0 or below are treated as corrupted and replaced
# with the median of the remaining fares.
# NOTE(review): a fare of exactly 0 is treated as invalid here - confirm that is intended.
fare = df_full_necc["fare"]
fare = fare.mask(fare <= 0)
avg_fare = fare.median()
# Round to 2 decimal places so fares look like real currency amounts.
df_full_necc["fare"] = fare.fillna(avg_fare).round(2)


# --- sex / embarked --------------------------------------------------------
# Casing is inconsistent in the raw data; normalise both columns to lowercase.
df_full_necc["sex"] = df_full_necc["sex"].astype(str).str.lower()
df_full_necc["embarked"] = df_full_necc["embarked"].str.lower()








In [None]:
# Boolean flag columns become 0/1 integers.
for flag_column in ("adult_male", "alone"):
    df_full_necc[flag_column] = df_full_necc[flag_column].astype(int)


# Binary-encode sex the same way: male -> 1, female -> 0.
# Any value that is not exactly "male" or "female" becomes NaN.
sex_codes = {"male": 1, "female": 0}
df_full_necc["sex"] = df_full_necc["sex"].map(sex_codes)


# Columns with several possible values are one-hot encoded: each distinct value
# gets its own indicator column. Encoding both columns in one call yields the
# embarked_* columns followed by the deck_* columns.
df_full_necc = pd.get_dummies(df_full_necc, columns=["embarked", "deck"])


# Standardise fare and age, which span a wide range, so their scale does not
# dominate the model. The scaler is re-fit for each column, so after this cell
# it holds the statistics of "age" only.
# NOTE(review): the scaler is fit on all rows, before the train/test split done
# in the modelling cell, so held-out rows influence the scaling statistics.
scaler = StandardScaler()
df_full_necc["fare"] = scaler.fit_transform(df_full_necc[["fare"]])
df_full_necc["age"] = scaler.fit_transform(df_full_necc[["age"]])





In [None]:
# Fit a logistic-regression classifier and report its metrics on a hold-out set.

# "survived" is the target; every other remaining column is a feature.
y = df_full_necc["survived"]
X = df_full_necc.drop(columns=["survived"])

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=40
)

# fit() returns the estimator itself, so training and binding happen in one step.
model = LogisticRegression().fit(X_train, y_train)

y_pred = model.predict(X_test)

# Each report prints its heading and its value on separate lines.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy, sep="\n")

classificationreport = classification_report(y_test, y_pred)
print("\nClassification Report:", classificationreport, sep="\n")

confusionmatrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:", confusionmatrix, sep="\n")

