# My first data analysis project

In [None]:
# importing packages
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import scipy.stats as stats
from sklearn.preprocessing import LabelEncoder

: 

In [None]:
# Show which pandas version this notebook is running against.
print(pd.__version__)

## Loading the Titanic dataset

In [None]:
# Read the training CSV and discard the PassengerId column straight away.
titanic_file_path = "datasets/train.csv"
df = pd.read_csv(titanic_file_path).drop(columns=['PassengerId'])

In [None]:
# Preview the first rows of the loaded frame.
df.head()

## Describing the dataframe

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) as computed by pandas.
df.describe()

## Data types of each column

In [None]:
# Per-column dtype and non-null count.
df.info()

## Counting missing values

In [None]:
# Number of missing values in each column.
df.isna().sum()

## Handling missing values

### Filling the missing values in Age
* null values in Age are replaced with the column's median

In [None]:
# Impute missing ages with the median age.
# The fill is restricted to the 'Age' column on purpose: a frame-wide
# df.fillna(median_age) would also write the median age into every other
# column that has gaps, corrupting non-age data.
median_age = df['Age'].median()
df['Age'] = df['Age'].fillna(median_age)
# Report the remaining gaps in 'Age' so the imputation can be checked at a glance.
print(f"Missing ages after imputation: {df['Age'].isnull().sum()}")

df.head()

### Dropping the Cabin column, as about 77% of its values are missing

In [None]:
# Remove the Cabin column in place, then preview the result.
df.drop(columns=['Cabin'], inplace=True)
df.head()

In [None]:
# Re-count missing values per column after the cleaning steps.
df.isnull().sum()

In [None]:
# Same per-column missing-value count, this time through isna().
df.isna().sum()

### Duplicated values

In [None]:
# Number of rows flagged by pandas as duplicates of another row.
df.duplicated().sum()

### Distribution of Ages

In [None]:
# Histogram (with KDE overlay) of passenger ages, 30 bins.
plt.figure(figsize=(10, 6))
sns.histplot(df['Age'], kde=True, color='blue', bins=30)
plt.title('Distribution of Ages', fontsize=16)
# The x-axis shows the binned Age values, so it is labelled 'Age'.
plt.xlabel('Age')
plt.ylabel("Frequency")
plt.show()

* Shows a slight positive skew

### Scatter plot

In [None]:
# Age-vs-fare scatter with a regression line drawn over the points.
plt.figure(figsize=(10, 6))
age_fare_axes = dict(x='Age', y='Fare', data=df)
sns.scatterplot(color='orange', **age_fare_axes)
# scatter=False: draw only the fitted line, the points are already plotted.
sns.regplot(scatter=False, color='blue', **age_fare_axes)
plt.title('Age vs Fare')
plt.show()

### Distribution of Fares

In [None]:
# Histogram (with KDE overlay) of ticket fares, 30 bins.
plt.figure(figsize=(10, 5))
fare_axes = sns.histplot(df['Fare'], kde=True, color='blue', bins=30)
fare_axes.set_title('Distribution of Fares', fontsize=14)
fare_axes.set_xlabel('Fares')
fare_axes.set_ylabel('Frequency')
plt.show()

* Shows a strongly positive skew

### Survival Rate 

In [None]:
# Overall outcome counts and rates for the training data.
total_passengers = len(df)
survival_counts = df['Survived'].value_counts()
# value_counts() is indexed by the Survived label: 0 = died, 1 = survived.
died, survived = survival_counts[0], survival_counts[1]
death_rate = died / total_passengers
survival_rate = survived / total_passengers

summary_lines = [
    f"Total Passengers: {total_passengers}",
    f"Survived: {survived}",
    f"Died: {died}",
    f"Survival Rate: {survival_rate:.2%}",
    f"Death Rate: {death_rate:.2%}",
]
print("\n".join(summary_lines))

### Visualization of Survival Rate

In [None]:
# Bar chart of how many passengers died vs. survived.
plt.figure(figsize=(8, 6))
# value_counts() orders its result by frequency, not by label. Reindexing to
# [0, 1] guarantees the counts line up with the hard-coded labels below
# (0 -> 'Died', 1 -> 'Survived') whichever outcome is more common.
survival_counts = df['Survived'].value_counts().reindex([0, 1], fill_value=0)
labels = ['Died', 'Survived']
plt.bar(labels, survival_counts.values, color=['red', 'green'])
plt.title('Survival Count')
plt.ylabel('Number of Passengers')
# Write each count just above the top of its bar.
for i, v in enumerate(survival_counts.values):
    plt.text(i, v + 10, str(v), ha='center')
plt.show()

### Did a higher percentage of men or women survive?

In [None]:
# Show a single row to check the current columns.
df.head(1)

In [None]:
# Turn the 'Sex' strings into integer codes.
# LabelEncoder numbers the classes in sorted order: female -> 0, male -> 1.
le = LabelEncoder()
le.fit(df['Sex'])
df['Sex_encoded'] = le.transform(df['Sex'])

df.head(1)

### Male Survival Rate

In [None]:
# Survival rate among male passengers (Sex_encoded == 1).
is_male = df.Sex_encoded == 1
# Each comparison is parenthesised: `&` binds tighter than `==`, so an
# unparenthesised `mask & df.Survived == 1` is evaluated as
# `(mask & df.Survived) == 1`, which is only right by accident for 0/1 data.
male_and_survived = is_male & (df.Survived == 1)

# Summing a boolean mask counts the rows where it is True.
total_male = is_male.sum()
male_survived = male_and_survived.sum()

print(f"Total male passengers:{total_male}")
print(f"Survived Male passengers:{male_survived}")
print(f"Male Passenger Survival Rate: {male_survived/total_male:.2%}")

### Female Survival Rate

In [None]:
# Survival rate among female passengers (Sex_encoded == 0).
is_female = df.Sex_encoded == 0
# Each comparison is parenthesised: `&` binds tighter than `==`, so an
# unparenthesised `mask & df.Survived == 1` is evaluated as
# `(mask & df.Survived) == 1`, which is only right by accident for 0/1 data.
female_and_survived = is_female & (df.Survived == 1)

# Summing a boolean mask counts the rows where it is True.
total_female = is_female.sum()
female_survived = female_and_survived.sum()

print(f"Total Female Passengers: {total_female}")
print(f"Total Female Passenger Survived: {female_survived}")
print(f"Female Passenger Survival Rate: {female_survived/total_female:.2%}")

In [None]:
# Bar chart comparing the number of male and female survivors.
plt.figure(figsize=(8, 6))

labels = ['Male', 'Female ']
survivor_counts = [male_survived, female_survived]
plt.bar(labels, survivor_counts, color=['blue', 'pink'])
plt.title('Survival Count')
plt.ylabel('Number of Passengers')
# Put each count just above its bar.
for position, count in enumerate(survivor_counts):
    plt.text(position, count + 1, str(count), ha='center')
plt.show()

### Conclusion:
It looks like female passengers survived at a higher rate than male passengers.

# Did people in certain passenger classes (Pclass) have a higher chance of survival?

In [None]:
# Head-count and survival rate for each passenger class.
# Boolean masks are built once; summing a mask counts its True rows.
in_class3 = df.Pclass == 3
in_class2 = df.Pclass == 2
in_class1 = df.Pclass == 1
did_survive = df.Survived == 1

total_pclass3 = in_class3.sum()
total_pclass2 = in_class2.sum()
total_pclass1 = in_class1.sum()

print(f"Total Passenger in Class 3: {total_pclass3}")
print(f"Total Passenger in Class 2: {total_pclass2}")
print(f"Total Passenger in Class 1: {total_pclass1}")

total_passenger_survived_p3 = (in_class3 & did_survive).sum()
total_passenger_survived_p2 = (in_class2 & did_survive).sum()
total_passenger_survived_p1 = (in_class1 & did_survive).sum()

print(f"Survival rate of Passenger in Pclass 3: {total_passenger_survived_p3/total_pclass3:.2%}")
print(f"Survival rate of Passenger in Pclass 2: {total_passenger_survived_p2/total_pclass2:.2%}")
print(f"Survival rate of Passenger in Pclass 1: {total_passenger_survived_p1/total_pclass1:.2%}")

In [None]:
# Bar chart of the number of survivors in each passenger class.
plt.figure(figsize=(5, 6))

labels = ['Class 1', 'Class 2', 'Class 3']
survivors_per_class = [
    total_passenger_survived_p1,
    total_passenger_survived_p2,
    total_passenger_survived_p3,
]
plt.bar(labels, survivors_per_class, color=['blue', 'green', 'red'])
plt.title('Survival Count')
plt.ylabel('Number of Passengers')
# Put each count just above its bar.
for position, count in enumerate(survivors_per_class):
    plt.text(position, count + 1, str(count), ha='center')
plt.show()

### Conclusion:

* It seems that class 1 passengers have the highest survival rate

# Building Machine Learning Model

In [None]:
# Model inputs (X) and the target column (y).
features = ['Pclass', 'Sex_encoded', 'Age', 'Fare']
X, y = df[features], df['Survived']

## Splitting the data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

## fitting the model ( Linear Regression Model)

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

## Prediction from the model

In [None]:
# Predictions of the linear model for the held-out rows (continuous values, not class labels).
y_pred = model.predict(X_test)

## Evaluation 

In [None]:
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    mean_absolute_error,
    r2_score,
)

# Regression metrics for the linear model on the held-out split.
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"R2 score: {r2:.2f}")
print(f"mae score: {mae:.2f}")

In [None]:
import os

import joblib

# Persist the fitted linear model to disk.
# The output directory is created first (no-op when it already exists):
# joblib.dump writes straight to the given path and fails with
# FileNotFoundError when the parent directory is missing.
os.makedirs("ML_model", exist_ok=True)
joblib.dump(model, "ML_model/Linear_regression.pkl")

### Calculating the models accuracy

# #accuracy = accuracy_score(y_test,y_pred)

* This caused an error because linear regression is meant for regression problems, where the predicted values are continuous rather than class labels
* for example: house price prediction, or predicting a person's age

## Logistic Regression for Classification Problems

* predicting discrete category or class
* like predicting if email is "spam" or "not spam"
* or whether a tumour is "malignant" or "benign"

* which makes it the better fit for this problem, i.e. predicting Titanic survival

## Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression

# Build the classifier with the liblinear solver, then train it on the training split.
log_model = LogisticRegression(solver='liblinear')
log_model.fit(X=X_train, y=y_train)

## Prediction using logistic regression model

In [None]:
# Class predictions for the held-out rows; this rebinds y_pred, replacing the linear model's output.
y_pred=log_model.predict(X_test)

## Calculate the accuracy of the model

In [None]:
# Fraction of held-out passengers whose outcome was predicted correctly.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2%}")

### Dumping the logistic regression model

In [None]:
import os

# Persist the fitted logistic model to disk.
# The output directory is created first (no-op when it already exists):
# joblib.dump writes straight to the given path and fails with
# FileNotFoundError when the parent directory is missing.
os.makedirs('ML_model', exist_ok=True)
joblib.dump(log_model, 'ML_model/Logistic_regression.pkl')

### considering we have not handled outliers

## Confusion Matrix

In [None]:
# Confusion matrix of actual vs. predicted labels for the held-out rows.
conf_matrix = confusion_matrix(y_test,y_pred)

In [None]:
# Wrap the raw matrix in a labelled frame so the heatmap axes are self-explanatory.
row_names = ['Actual 0 (No)', 'Actual 1 (Yes)']
column_names = ['Predicted 0 (No)', 'Predicted 1 (Yes)']
conf_matrix_df = pd.DataFrame(conf_matrix, index=row_names, columns=column_names)
sns.set(font_scale=1.2)

plt.figure(figsize=(8, 6))
# fmt='d' prints the cell counts as integers; the colour bar is switched off.
sns.heatmap(conf_matrix_df, annot=True, fmt='d', cmap='Blues', cbar=False)

plt.title('Confusion Matrix for Titanic Survival Prediction')
plt.xlabel('Predicted Label')
plt.ylabel('Actual Label')
plt.show()

# Extracting Info through Feature engineering

## Family Size

In [None]:
# Family size = siblings/spouses + parents/children + the passenger themselves.
df['FamilySize'] = df['SibSp'].add(df['Parch']).add(1)
df['FamilySize']

In [None]:
# 1 when the passenger has a family size of one, otherwise 0.
df["IsAlone"] = df['FamilySize'].eq(1).astype(int)

In [None]:
# Display the new IsAlone column.
df.IsAlone

In [None]:
# Bucket ages into named groups.
# pd.cut uses right-inclusive bins by default: (0, 12], (12, 18], (18, 65], (65, 100].
labels = ['Child', 'Teen', 'Adult', 'Senior']
age_bin_edges = [0, 12, 18, 65, 100]
df['AgeGroup'] = pd.cut(df['Age'], bins=age_bin_edges, labels=labels)
# Show one randomly chosen row.
df.sample()

## Correlation Matrix

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap.
numerical_cols = df.select_dtypes(include='number').columns

correlation_matrix = df[numerical_cols].corr()
plt.figure(figsize=(10, 7))
sns.heatmap(correlation_matrix, annot=True)
# Title and explicit plt.show(), matching the other plotting cells, so the
# figure is also rendered when this code runs outside a notebook.
plt.title('Correlation Matrix')
plt.show()

## Cross Validation 

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the logistic model over the full X / y.
cv_scores = cross_val_score(log_model, X, y, cv=5)
cv_mean = cv_scores.mean()
# Twice the standard deviation of the fold scores is reported as the spread.
cv_spread = cv_scores.std() * 2
print(f"CV accuracy: {cv_mean:.3f}(+/- {cv_spread:.3f})")

## Testing the model using the test dataset

In [None]:
# Score datasets/test.csv with the trained logistic model.
test_df = pd.read_csv('datasets/test.csv')
# Reuse the encoder fitted on the training data so the codes stay consistent.
test_df['Sex_encoded'] = le.transform(test_df['Sex'])
# Impute per column, using statistics taken from the training frame.
# A frame-wide test_df.fillna(median_age) would also write the median *age*
# into missing 'Fare' values (a model feature) and into every other column
# that has gaps.
test_df['Age'] = test_df['Age'].fillna(median_age)
test_df['Fare'] = test_df['Fare'].fillna(df['Fare'].median())
# NOTE(review): 'Pclass' is assumed to have no missing values — confirm.
Xt = test_df[features]
yt = log_model.predict(Xt)
test_df['Survived_Prediction'] = yt

In [None]:
# Compare the predicted outcome split on the test file with the training rates.
predictions = test_df.Survived_Prediction
# Summing a boolean mask counts the rows where it is True.
died = (predictions == 0).sum()
print(f"No. of death predicted: {died}")
survived = (predictions == 1).sum()
print(f"No. of Survived Prediction: {survived}\n")

# Denominator: number of non-null PassengerId values in the test frame.
n_test_passengers = test_df.PassengerId.count()
predicted_survival_rate = survived / n_test_passengers
predicted_death_rate = died / n_test_passengers
print(f"Training Survival  Rate: {survival_rate:.2%}")
print(f"Training Death Rate: {death_rate:.2%}\n")
print(f"Predicted survival rate : {predicted_survival_rate:.2%}")
print(f"Predicted death rate : {predicted_death_rate:.2%}")

# Hypothesis Testing
* Null hypothesis (H0): There is no difference in survival rates between passenger classes
* Alternative hypothesis (H1): There is a significant difference in survival rates between passenger classes

In [None]:
# One-way ANOVA: does the mean of 'Survived' differ between passenger classes?
from scipy.stats import f_oneway

# Significance level used to decide between H0 and H1.
alpha = 0.05

# One sample of 0/1 survival outcomes per passenger class.
class1_survival = df.loc[df['Pclass'] == 1, 'Survived']
class2_survival = df.loc[df['Pclass'] == 2, 'Survived']
class3_survival = df.loc[df['Pclass'] == 3, 'Survived']

f_stat, p_value = f_oneway(class1_survival, class2_survival, class3_survival)
print(f"ANOVA F-stat: {f_stat:.4f}")
print(f"P-value: {p_value:.6f}")

# Reject H0 when the p-value falls below alpha.
verdict = (
    "Passenger class significantly affects the Survival rate"
    if p_value < alpha
    else "Passenger class does not affects the Survival rate"
)
print(verdict)


# Conclusion:
## This data clearly shows that travelling in a higher passenger class was a major factor in Titanic survival rates