Cleaning Train Dataset

In [64]:
import pandas as pd

# Load the raw training data.
df = pd.read_csv('train.csv')

# Impute missing values without mutating in place:
#   * Cabin / Embarked -> most frequent value of the column (mode)
#   * Age              -> column mean, stored under the new name 'Column_Age'
# The original 'Age' column is then dropped, so only 'Column_Age' remains.
df = (
    df
    .fillna({
        'Cabin': df['Cabin'].mode()[0],
        'Embarked': df['Embarked'].mode()[0],
    })
    .assign(Column_Age=df['Age'].fillna(df['Age'].mean()))
    .drop(columns='Age')
)

# index=False keeps the row index out of the file; otherwise it comes back as
# a stray 'Unnamed: 0' column when the CSV is read again.
df.to_csv("cleaned_train_dataset.csv", index=False)

# Last expression of the cell, so the remaining null counts are displayed.
df.isnull().sum()


Encode Train Dataset

In [65]:
# Import necessary libraries
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score

# Read back the cleaned training set produced by the cleaning step.
train_encode = pd.read_csv('cleaned_train_dataset.csv')

# Keep the label aside; only the text-valued columns listed here are encoded.
y = train_encode['Survived']
categorical_columns = ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']
X = train_encode[categorical_columns]

# Dense one-hot encoding; the first category of every column is dropped so the
# resulting indicator columns are not perfectly collinear.
encoder = OneHotEncoder(drop='first', sparse_output=False)
X_encoded = encoder.fit_transform(X)

# Wrap the encoded matrix in a DataFrame labelled with the generated names.
encoded_df = pd.DataFrame(
    data=X_encoded,
    columns=encoder.get_feature_names_out(X.columns),
)

# Put the label back next to the encoded features and persist the result.
df_encoded = pd.concat(objs=[encoded_df, y], axis=1)
df_encoded.to_csv("encoded_train_data.csv", index=False)


Model Training (LogisticRegression)

In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import joblib

# Read the one-hot encoded training table.
data = pd.read_csv('encoded_train_data.csv')

# Name of the label column; everything else is treated as a feature.
target_column = 'Survived'
y = data[target_column]
X = data.drop(columns=[target_column])

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Build the classifier and fit it on the training portion (fit returns the
# estimator itself, so construction and fitting can be chained).
model = LogisticRegression().fit(X_train, y_train)

# Score the held-out rows.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Report the metrics: accuracy on one line, then each titled section.
print(f"Accuracy: {accuracy}")
for heading, body in (
    ("Confusion Matrix:", conf_matrix),
    ("Classification Report:", class_report),
):
    print(heading)
    print(body)

# Persist the fitted model so the prediction step can reload it.
joblib.dump(model, 'model_1')





Accuracy: 0.7932960893854749
Confusion Matrix:
[[86 19]
 [18 56]]
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.82      0.82       105
           1       0.75      0.76      0.75        74

    accuracy                           0.79       179
   macro avg       0.79      0.79      0.79       179
weighted avg       0.79      0.79      0.79       179



['model_1']

Cleaning Test Dataset

In [67]:
import pandas as pd

# Load the raw test data, plus the raw training data that supplies the
# imputation statistics.
df = pd.read_csv('test.csv')
train_raw = pd.read_csv('train.csv')

# Fill values are derived from the TRAINING set, not from the test set, so a
# missing Cabin/Embarked in the test data is replaced by the same category the
# training cleaning step used. Computing the mode/mean on the test set instead
# would impute different values and feed the model categories that mean
# something else than they did during training.
fill_values = {
    'Cabin': train_raw['Cabin'].mode()[0],
    'Embarked': train_raw['Embarked'].mode()[0],
    'Fare': train_raw['Fare'].mean(),
}

# Apply the fills without mutating in place; Age is carried forward under the
# name 'Column_Age' (gaps filled with the training mean) and the original
# 'Age' column is dropped, mirroring the training cleaning step.
df = (
    df
    .fillna(fill_values)
    .assign(Column_Age=df['Age'].fillna(train_raw['Age'].mean()))
    .drop(columns='Age')
)

# index=False keeps the row index out of the file; otherwise it comes back as
# a stray 'Unnamed: 0' column when the CSV is read again.
df.to_csv("cleaned_test_dataset.csv", index=False)

# Last expression of the cell, so the remaining null counts are displayed.
df.isnull().sum()


Encode Test Dataset

In [74]:
# Import necessary libraries
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score

# Columns that are one-hot encoded; this must be the same list, in the same
# order, as the one used when encoding the training set.
categorical_columns = ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']

# Cleaned test rows to encode, and the cleaned training rows that define the
# category vocabulary.
test_clean = pd.read_csv('cleaned_test_dataset.csv')
train_clean = pd.read_csv('cleaned_train_dataset.csv')

# Features to encode, and the passenger identifier that is carried through
# untouched (it is an id, not a prediction target).
X = test_clean[categorical_columns]
y = test_clean['PassengerId']

# Fit the encoder on the TRAINING data and only transform the test data.
# Fitting a fresh encoder on the test set would learn a different category
# set and a different `drop='first'` baseline, so the columns would not line
# up with what the model was trained on. handle_unknown='ignore' encodes
# categories never seen in training as all zeros (scikit-learn emits a warning
# about this when combined with `drop`, which is expected here).
encoder = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')
encoder.fit(train_clean[categorical_columns])
X_encoded = encoder.transform(X)

# Wrap the encoded matrix in a DataFrame labelled with the generated names.
encoded_df = pd.DataFrame(
    X_encoded,
    columns=encoder.get_feature_names_out(categorical_columns),
    index=X.index,
)

# Attach the passenger identifier next to the encoded features.
df_encoded = pd.concat([encoded_df, y], axis=1)

# Save the encoded dataframe to a CSV file.
df_encoded.to_csv("encoded_test_dataset.csv", index=False)


Predict The Passengers

In [75]:
import pandas as pd
import joblib

# Load the model saved by the training cell. joblib uses pickle under the
# hood, so only load model files you produced yourself.
model = joblib.load('model_1')

# Load the encoded training data to recover the exact feature columns (and
# their order) that the model was fitted on.
training_data = pd.read_csv('encoded_train_data.csv')
training_features = training_data.drop(columns=['Survived']).columns

# Load the encoded test data.
test_data = pd.read_csv('encoded_test_dataset.csv')

# Columns that are not model features. Previously these were written as
# `'Survived' ,'Passengerid'`, which builds a tuple (with a mis-cased id
# name), so the membership check never matched and nothing was dropped.
target_column = 'Survived'
id_column = 'PassengerId'

# Take the passenger ids from the data itself so they always match the rows
# being predicted. Fall back to the historical hard-coded start (892) only
# when the id column is absent from the file.
if id_column in test_data.columns:
    index_ids = test_data[id_column].tolist()
else:
    index_ids = list(range(892, 892 + len(test_data)))

# Remove the non-feature columns when present (errors='ignore' tolerates
# their absence).
test_data = test_data.drop(columns=[target_column, id_column], errors='ignore')

# Reorder the columns to match the training data; features missing from the
# test file are added and filled with 0, unexpected extras are discarded.
test_data = test_data.reindex(columns=training_features, fill_value=0)

# Make predictions for each row in the test data.
predictions = model.predict(test_data)

# One row per passenger: identifier plus predicted label.
# NOTE(review): the output header is spelled 'PassengerID' while the input
# column is 'PassengerId' — kept unchanged here so existing consumers of
# predictions.csv keep working; confirm which spelling the consumer expects.
results = pd.DataFrame({
    'PassengerID': index_ids,
    'Survived': predictions
})

# Display the results.
print(results)

# Save the results to a CSV file.
results.to_csv('predictions.csv', index=False)


     PassengerID  Survived
0            892         0
1            893         1
2            894         0
3            895         0
4            896         1
..           ...       ...
413         1305         0
414         1306         1
415         1307         0
416         1308         0
417         1309         1

[418 rows x 2 columns]


# New Section