In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud 

In [None]:
df = pd.read_csv("Titanic-Dataset.csv")

## Exploratory Data Analysis

In [None]:
df.head()

### PassengerId

In [None]:
df["PassengerId"].describe()

### Survived

In [None]:
sns.countplot(x=df['Survived'])

### Pclass

In [None]:
df["Pclass"].unique()

In [None]:
# Number of passengers travelling in each ticket class
class_counts = df['Pclass'].value_counts()

# Pie chart of each class's share, labelled with the class and its percentage
fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(class_counts, labels=class_counts.index, autopct='%1.1f%%', startangle=90)
ax.axis('equal')
ax.set_title('Passenger Class Distribution')
plt.show()

In [None]:
df.columns

### Name

In [None]:
'''Creating a wordcloud for name to help in the visualization'''

In [None]:
import re

# Generate word clouds for each unique name.
# NOTE: this renders one figure and writes one PNG per unique name, so every
# figure is closed as soon as it has been saved and shown.
unique_names = df['Name'].unique()
for name in unique_names:
    # Rows carrying exactly this name, joined into the text the cloud is built from
    name_df = df[df['Name'] == name]
    text = ' '.join(name_df['Name'])

    # Create a word cloud
    wordcloud = WordCloud(width=400, height=400,
                          background_color='white',
                          stopwords=None,
                          min_font_size=10).generate(text)

    # Display the word cloud on its own figure so it can be released afterwards
    fig, ax = plt.subplots(figsize=(8, 8), facecolor=None)
    ax.imshow(wordcloud)
    ax.axis("off")
    fig.tight_layout(pad=0)

    # Filesystem-friendly file name: punctuation becomes "_", then each run of
    # whitespace collapses to a single "_"
    sanitized_name = re.sub(r'\s+', '_', re.sub(r'[^\w\s]', '_', name))
    fig.savefig(f"{sanitized_name}_wordcloud.png")
    plt.show()

    # Release the figure; otherwise open figures pile up across iterations
    plt.close(fig)


In [None]:
df.columns

### Gender

In [None]:
df["Sex"].unique()

In [None]:
''' Only 2 unique values ( of course) '''

In [None]:
sns.countplot(x=df['Sex'], hue=df['Survived'])

In [None]:
df.columns

### Age

In [None]:
df.isnull().sum()

In [None]:
# Histogram of passenger ages.
# Age has missing values, so they are dropped explicitly instead of handing
# NaNs to the binning routine.
plt.figure(figsize=(8, 6))
plt.hist(df['Age'].dropna(), bins=10, color='skyblue', edgecolor='black')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.title('Histogram of Age')
plt.show()

In [None]:
df.columns

### SibSp

In [None]:
# Passenger count for every SibSp value, ordered by the SibSp value itself
freq_mapping = df['SibSp'].value_counts().sort_index()

# Line plot of those counts with a marker at each observed value
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(freq_mapping.index, freq_mapping.values, marker='o')
ax.set_xlabel('SibSp')
ax.set_ylabel('Frequency')
ax.set_title('Frequency Mapping of SibSp')
ax.grid(True)
plt.show()

In [None]:
df.columns

### Parch

In [None]:
# Passenger count for every Parch value, ordered by the Parch value itself
freq_mapping = df['Parch'].value_counts().sort_index()

# Line plot of those counts with a marker at each observed value
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(freq_mapping.index, freq_mapping.values, marker='o')
ax.set_xlabel('Parch')
ax.set_ylabel('Frequency')
ax.set_title('Frequency Mapping of Parch')
ax.grid(True)
plt.show()

In [None]:
df.columns

### Ticket

In [None]:
df["Ticket"].describe()

In [None]:
df["Ticket"].unique()

### Fare

In [None]:
df["Fare"].value_counts()

In [None]:
df["Fare"].describe()

In [None]:
df.columns

In [None]:
# Function to check if a value represents a whole number
def is_convertible_to_int(value):
    """Return True when ``value`` is a whole number, False otherwise.

    ``int()`` silently truncates floats (``int(7.25) == 7``), so merely trying
    the conversion reports every numeric value as convertible. The value is
    parsed as a float instead and checked for a zero fractional part.

    Parameters
    ----------
    value : object
        A number or a numeric string.

    Returns
    -------
    bool
        True for integers and for floats / numeric strings without a
        fractional part; False for fractional values, NaN, infinities,
        ``None`` and anything that cannot be parsed as a number.
    """
    # Python ints (including ones too large for a float) are whole by definition
    if isinstance(value, int):
        return True
    try:
        return float(value).is_integer()
    except (TypeError, ValueError, OverflowError):
        return False

# Flag every fare with is_convertible_to_int. The flags live in a standalone
# Series, so df never receives (and never has to lose) a scratch column.
convertible_flags = df['Fare'].apply(is_convertible_to_int).rename('ConvertibleToInt')

# Create a countplot of the True / False flags
plt.figure(figsize=(8, 6))
sns.countplot(x=convertible_flags)
plt.xlabel('Convertible to Int')
plt.ylabel('Count')
plt.title('Count of Values Convertible to Int vs Not Convertible')
plt.show()

### Cabin

In [None]:
# Bar plot with one bar per distinct Cabin value; tick labels are tilted 45 degrees
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='Cabin', data=df, ax=ax)
ax.set_xlabel('Cabin')
ax.set_ylabel('Frequency')
ax.set_title('Frequency of String Values')
ax.tick_params(axis='x', labelrotation=45)
plt.show()

In [None]:
'''As you can see, its a mess xd'''

### Embarked

In [None]:
df["Embarked"].unique()

In [None]:
# Number of passengers per value of the Embarked column
class_counts = df['Embarked'].value_counts()

# Pie chart of each Embarked value's share, labelled with the value and its percentage
plt.figure(figsize=(8, 8))
plt.pie(class_counts, labels=class_counts.index, autopct='%1.1f%%', startangle=90)
plt.axis('equal')
# The title describes the Embarked column (it previously repeated the Pclass chart's title)
plt.title('Port of Embarkation Distribution')
plt.show()

## Feature Engineering

In [None]:
# Rename PassengerId to Id and drop Name.
# rename() skips labels that are absent and drop(errors="ignore") does the same,
# so re-running this cell is a no-op instead of a KeyError.
df = df.rename(columns={"PassengerId": "Id"}).drop(columns="Name", errors="ignore")

In [None]:
# Keep only the modelling columns, in a fixed order with the target (Survived) last.
# .copy() makes the result an independent frame, so later column assignments
# do not trigger pandas' SettingWithCopyWarning.
df = df[['Id', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked' , "Survived"]].copy()

### Label encoding: Embarked, Sex

In [None]:
from sklearn.preprocessing import LabelEncoder

# Missing Embarked values become their own "0" category before encoding
df['Embarked'] = df['Embarked'].fillna("0")

# One encoder per column: refitting a single LabelEncoder on Sex would discard
# the Embarked mapping, making it impossible to decode Embarked later.
embarked_encoder = LabelEncoder()
df['Embarked'] = embarked_encoder.fit_transform(df['Embarked'])

sex_encoder = LabelEncoder()
df["Sex"] = sex_encoder.fit_transform(df["Sex"])

# Keep the original name bound to the last-fitted encoder, as this cell did before
encoder = sex_encoder

### Dealing with NULL values

In [None]:
df.isnull().sum()

In [None]:
# Fill every missing Age with the mean of the ages that are present
mean_age = df['Age'].mean()
df['Age'] = df['Age'].where(df['Age'].notna(), mean_age)

## Feature Selection

In [None]:
df.columns

In [None]:
from scipy.stats import pearsonr

# Pearson r between Survived and every column except the last one (Survived itself)
correlations = {
    column: pearsonr(df[column], df['Survived'])[0]
    for column in df.columns[:-1]
}

print("Pearson correlation coefficients: \n")

for column in correlations:
    print(f"{column}: {correlations[column]}")

In [None]:
''' HEATMAP OF PEARSON CORRELATION'''

# Pairwise correlation between all columns of df
corr_matrix = df.corr()

# Annotated heatmap with the colour scale pinned to the full [-1, 1] range
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title('Pearson Correlation Heatmap')
plt.show()

In [None]:
df.columns

In [None]:
'''Chi-square test to check the degree of independence with respect to our Survived column'''
from scipy.stats import chi2_contingency

print("Chi-square statistic for: \n")
columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

# Age and Fare are continuous. Cross-tabulating their raw values gives one
# nearly empty row per distinct value, which breaks the usual guideline that
# observed and expected cell frequencies should be at least 5. They are
# therefore grouped into quantile bins before the contingency table is built.
continuous_columns = {'Age', 'Fare'}
n_bins = 4
results = []

for col in columns:
    values = df[col]
    if col in continuous_columns:
        # duplicates='drop' merges bins whose edges coincide (heavily repeated values)
        values = pd.qcut(values, q=n_bins, duplicates='drop')

    contingency_table = pd.crosstab(values, df['Survived'])
    chi2, p_value, _, _ = chi2_contingency(contingency_table)
    results.append({'Column': col, 'Chi-square': chi2, 'P-value': p_value})
    print(f"{col}: {chi2} ")
    print(f"P-value for {col}: {p_value} \n")

# Create a DataFrame from the results
results_df = pd.DataFrame(results)

# Statistical representation using a table
print(results_df)

# Graphical representation using a bar plot
plt.figure(figsize=(12, 6))
sns.barplot(x='Column', y='Chi-square', data=results_df, palette='viridis')
plt.title('Chi-square statistic for each column')
plt.xlabel('Column')
plt.ylabel('Chi-square')
plt.xticks(rotation=45)
plt.show()

In [None]:
df.columns

In [None]:
# Remove the Id column from df in place
df.drop(columns=["Id"], inplace=True)

## Model Building

In [None]:
# Target: Survived, kept as a one-column DataFrame (double brackets)
Y = df[["Survived"]]
# Features: every remaining column
X = df.drop(columns=["Survived"])

In [None]:
# Hold out 15% of the rows as the test set.
# random_state=42 fixes the shuffle so the split is the same on every run.

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.15,
    random_state=42,
)

In [None]:
# Shapes of the four splits, in the order X_train, X_test, y_train, y_test
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

In [None]:
X_train.head()

In [None]:
X_test.head()

In [None]:
y_train.head()

In [None]:
y_test.head()

In [None]:
# Swap each pandas split for its underlying NumPy array
X_train, X_test = X_train.to_numpy(), X_test.to_numpy()
y_train, y_test = y_train.to_numpy(), y_test.to_numpy()

In [None]:
# Transpose the feature matrices, then lay each label array out as a single
# row with one entry per column of the matching (transposed) feature matrix
X_train, X_test = X_train.T, X_test.T

y_train = y_train.reshape(1, X_train.shape[1])
y_test = y_test.reshape(1, X_test.shape[1])

In [None]:
# Report the shape of every array after the transpose / reshape step
for label, split in (
    ("Shape of X_train : ", X_train),
    ("Shape of Y_train : ", y_train),
    ("Shape of X_test : ", X_test),
    ("Shape of Y_test : ", y_test),
):
    print(label, split.shape)

### Model

In [None]:
def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), evaluated without overflow.

    The direct formula computes ``exp(-x)``, which overflows (RuntimeWarning)
    for large negative ``x``. The identity
    ``sigmoid(x) = exp(-log(1 + exp(-x)))`` is used instead, with
    ``np.logaddexp(0, -x)`` computing ``log(1 + exp(-x))`` stably.

    Parameters
    ----------
    x : scalar or numpy array
        Input value(s).

    Returns
    -------
    Same shape as ``x``, with every value in [0, 1].
    """
    return np.exp(-np.logaddexp(0, -x))

In [None]:
def model(X, Y, learning_rate, iterations):
    """Fit logistic regression with batch gradient descent.

    Parameters
    ----------
    X : numpy array of shape (n, m)
        Feature matrix with one row per feature and one column per sample.
    Y : numpy array of shape (1, m)
        0/1 labels, one per column of ``X``.
    learning_rate : float
        Step size of every gradient-descent update.
    iterations : int
        Number of gradient-descent updates to perform.

    Returns
    -------
    W : numpy array of shape (n, 1)
        Learned weights.
    B : float
        Learned bias.
    cost_list : list
        Cross-entropy cost recorded at every iteration.
    """
    # Sizes come from the X argument itself, not from the notebook-level
    # X_train, so the function works for whatever data it is handed.
    n, m = X.shape

    W = np.zeros((n, 1))
    B = 0

    cost_list = []

    # Activations are clipped away from exactly 0 and 1 for the cost only,
    # so log() never yields -inf / nan; the gradients use the raw values.
    eps = 1e-15

    for i in range(iterations):

        Z = np.dot(W.T, X) + B
        A = sigmoid(Z)

        # cost function (binary cross-entropy averaged over the m samples)
        A_safe = np.clip(A, eps, 1 - eps)
        cost = -(1/m)*np.sum(Y*np.log(A_safe) + (1-Y)*np.log(1-A_safe))

        # Gradient Descent
        dW = (1/m)*np.dot(A-Y, X.T)
        dB = (1/m)*np.sum(A - Y)

        W = W - learning_rate*dW.T
        B = B - learning_rate*dB

        # Keeping track of our cost function value
        cost_list.append(cost)

        # Progress report roughly every tenth of the run
        if i % (iterations/10) == 0:
            print("cost after ", i, "iteration is : ", cost)

    return W, B, cost_list

In [None]:
# Gradient-descent settings, followed by the fit on the training split
learning_rate = 0.0090
iterations = 100000
W, B, cost_list = model(
    X_train,
    y_train,
    learning_rate=learning_rate,
    iterations=iterations,
)

## Testing Model Accuracy

In [None]:
def accuracy(X, Y, W, B):
    """Print the percentage of samples in ``X`` whose prediction matches ``Y``.

    A sample is predicted as 1 when ``sigmoid(W.T @ X + B)`` exceeds 0.5 and
    as 0 otherwise. ``Y.shape[1]`` is used as the number of samples. The
    result is printed rounded to two decimals; nothing is returned.
    """
    probabilities = sigmoid(np.dot(W.T, X) + B)

    # Threshold at 0.5 and turn the boolean mask into 0/1 integers
    predictions = np.array(probabilities > 0.5, dtype='int64')

    # Each mismatch contributes |prediction - label| = 1
    mismatches = np.sum(np.absolute(predictions - Y))
    acc = (1 - mismatches/Y.shape[1])*100

    print("Accuracy of the model is : ", round(acc, 2), "%")

In [None]:
accuracy(X_test, y_test, W, B)