<a href="https://colab.research.google.com/github/KHN70/CN6005-Ai-Weekly-Portfolio/blob/main/Week_3Code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import pandas as pd

# Load the 'titanic (1).csv' file into a pandas DataFrame
df = pd.read_csv('/content/titanic (1).csv')

print("First 5 rows of the DataFrame:")
# Display the first 5 rows of the DataFrame
print(df.head())

print("\nConcise summary of the DataFrame:")
# Display a concise summary of the DataFrame
df.info()

print("\nDescriptive statistics of the DataFrame:")
# Generate descriptive statistics of the DataFrame
print(df.describe())

print("\nMissing values in each column:")
# Check for missing values in each column and display their counts
print(df.isnull().sum())

First 5 rows of the DataFrame:
   PassengerId  Pclass                                               Name  \
0            1       3                            Braund, Mr. Owen Harris   
1            2       1  Cumings, Mrs. John Bradley (Florence Briggs Th...   
2            3       3                             Heikkinen, Miss. Laina   
3            4       1       Futrelle, Mrs. Jacques Heath (Lily May Peel)   
4            5       3                           Allen, Mr. William Henry   

      Sex   Age            Ticket     Fare  Survived  
0    male  22.0         A/5 21171   7.2500         0  
1  female  38.0          PC 17599  71.2833         1  
2  female  26.0  STON/O2. 3101282   7.9250         1  
3  female  35.0            113803  53.1000         1  
4    male  35.0            373450   8.0500         0  

Concise summary of the DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dty

In [6]:
print("Before preprocessing, missing values:")
print(df.isnull().sum())

# 1. Impute missing 'Age' values with the mean
df['Age'].fillna(df['Age'].mean(), inplace=True)

# 2. Convert the categorical 'Sex' column into numerical format using one-hot encoding
df = pd.get_dummies(df, columns=['Sex'], drop_first=True) # drop_first=True to avoid multicollinearity

# 3. Drop the 'PassengerId', 'Name', and 'Ticket' columns from the DataFrame
df.drop(['PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)

print("\nFirst 5 rows of the preprocessed DataFrame:")
print(df.head())

print("\nConcise summary of the preprocessed DataFrame:")
df.info()

print("\nMissing values in each column after preprocessing:")
print(df.isnull().sum())

Before preprocessing, missing values:
PassengerId      0
Pclass           0
Name             0
Sex              0
Age            177
Ticket           0
Fare             0
Survived         0
dtype: int64

First 5 rows of the preprocessed DataFrame:
   Pclass   Age     Fare  Survived  Sex_male
0       3  22.0   7.2500         0      True
1       1  38.0  71.2833         1     False
2       3  26.0   7.9250         1     False
3       1  35.0  53.1000         1     False
4       3  35.0   8.0500         0      True

Concise summary of the preprocessed DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    int64  
 1   Age       891 non-null    float64
 2   Fare      891 non-null    float64
 3   Survived  891 non-null    int64  
 4   Sex_male  891 non-null    bool   
dtypes: bool(1), float64(2), int64(2)
memory usage: 28.8 

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)


In [7]:
import pandas as pd

# Reload the original dataset to ensure 'Sex' column is present
df = pd.read_csv('/content/titanic (1).csv')

print("Before preprocessing, missing values:")
print(df.isnull().sum())

# 1. Impute missing 'Age' values with the mean
df['Age'] = df['Age'].fillna(df['Age'].mean())

# 2. Convert the categorical 'Sex' column into numerical format using one-hot encoding
df = pd.get_dummies(df, columns=['Sex'], drop_first=True)

# 3. Drop the 'PassengerId', 'Name', and 'Ticket' columns from the DataFrame
df.drop(['PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)

print("\nFirst 5 rows of the preprocessed DataFrame:")
print(df.head())

print("\nConcise summary of the preprocessed DataFrame:")
df.info()

print("\nMissing values in each column after preprocessing:")
print(df.isnull().sum())

Before preprocessing, missing values:
PassengerId      0
Pclass           0
Name             0
Sex              0
Age            177
Ticket           0
Fare             0
Survived         0
dtype: int64

First 5 rows of the preprocessed DataFrame:
   Pclass   Age     Fare  Survived  Sex_male
0       3  22.0   7.2500         0      True
1       1  38.0  71.2833         1     False
2       3  26.0   7.9250         1     False
3       1  35.0  53.1000         1     False
4       3  35.0   8.0500         0      True

Concise summary of the preprocessed DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    int64  
 1   Age       891 non-null    float64
 2   Fare      891 non-null    float64
 3   Survived  891 non-null    int64  
 4   Sex_male  891 non-null    bool   
dtypes: bool(1), float64(2), int64(2)
memory usage: 28.8 

In [8]:
from sklearn.preprocessing import StandardScaler

# Separate features (X) and target (y)
X = df.drop('Survived', axis=1)
y = df['Survived']

# Initialize the StandardScaler
scaler = StandardScaler()

# Identify numerical columns for scaling (excluding the target and already one-hot encoded 'Sex_male')
# 'Sex_male' is boolean, Pclass, Age, Fare are numerical features.
numerical_cols = ['Pclass', 'Age', 'Fare']

# Scale the numerical features
X[numerical_cols] = scaler.fit_transform(X[numerical_cols])

print("First 5 rows of features (X) after scaling numerical columns:")
print(X.head())

print("\nFirst 5 rows of target (y):")
print(y.head())

First 5 rows of features (X) after scaling numerical columns:
     Pclass       Age      Fare  Sex_male
0  0.827377 -0.592481 -0.502445      True
1 -1.566107  0.638789  0.786845     False
2  0.827377 -0.284663 -0.488854     False
3 -1.566107  0.407926  0.420730     False
4  0.827377  0.407926 -0.486337      True

First 5 rows of target (y):
0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64


In [9]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

print("\nFirst 5 rows of X_train:")
print(X_train.head())


Shape of X_train: (712, 4)
Shape of X_test: (179, 4)
Shape of y_train: (712,)
Shape of y_test: (179,)

First 5 rows of X_train:
       Pclass       Age      Fare  Sex_male
331 -1.566107  1.215947 -0.074583      True
733 -0.369365 -0.515526 -0.386671      True
382  0.827377  0.177063 -0.488854      True
704  0.827377 -0.284663 -0.490280      True
813  0.827377 -1.823750 -0.018709     False


In [10]:
from sklearn.neighbors import KNeighborsClassifier

# Initialize the KNN classifier with n_neighbors=5 (a common starting point)
knn_classifier = KNeighborsClassifier(n_neighbors=5)

# Train the KNN classifier using the training data
knn_classifier.fit(X_train, y_train)

print("KNN classifier trained successfully with n_neighbors=5.")

KNN classifier trained successfully with n_neighbors=5.


In [11]:
from sklearn.linear_model import LogisticRegression

# Initialize the Logistic Regression classifier with a specified random_state for reproducibility
logistic_regression_classifier = LogisticRegression(random_state=42, solver='liblinear')

# Train the Logistic Regression classifier using the training data
logistic_regression_classifier.fit(X_train, y_train)

print("Logistic Regression classifier trained successfully.")

Logistic Regression classifier trained successfully.


In [12]:
from sklearn.metrics import accuracy_score

# Make predictions on the test set using the KNN classifier
y_pred_knn = knn_classifier.predict(X_test)

# Calculate the accuracy of the KNN classifier
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print(f"KNN Classifier Accuracy: {accuracy_knn:.4f}")

# Make predictions on the test set using the Logistic Regression classifier
y_pred_lr = logistic_regression_classifier.predict(X_test)

# Calculate the accuracy of the Logistic Regression classifier
accuracy_lr = accuracy_score(y_test, y_pred_lr)
print(f"Logistic Regression Classifier Accuracy: {accuracy_lr:.4f}")

KNN Classifier Accuracy: 0.8156
Logistic Regression Classifier Accuracy: 0.7989
