In [10]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [11]:
# Read the training dataset
df = pd.read_csv('D:\\SensViz\\Machine Learning Task 5\\train.csv')

In [12]:
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [13]:
# Perform data preprocessing
df.isnull().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [14]:
#drop the missing columns
df.dropna(inplace=True)

In [15]:
# creating a instant of label encoder
label_encoder = LabelEncoder()

In [16]:
# preprocessing on the train data set
# converting the transported column into 0,1 
df['Transported'] = label_encoder.fit_transform(df['Transported'])
# converting the VIP into 0,1
df['VIP'] = label_encoder.fit_transform(df['VIP'])
# convert ting it into numeric form
df['CryoSleep'] = label_encoder.fit_transform(df['CryoSleep'])
# droping the name column
df = df.drop('Name', axis=1)
# removing the null space and convert into lover case
df["HomePlanet"] = df["HomePlanet"].str.lower().str.replace(' ', '')
# convert into lower case
df['HomePlanet'] = label_encoder.fit_transform(df['HomePlanet'])
# conversion of passenger id into float type
df['PassengerId'] = df['PassengerId'].str.replace('_', '').astype(float)
#removing symbols conversion into lover case also in numeric form
df['Cabin'] = df['Cabin'].str.replace('[0-9/]', '', regex=True)
df['Cabin'] = label_encoder.fit_transform(df['Cabin'])
#removing symbols conversion into lover case also in numeric fo
df['Destination'] = df['Destination'].str.replace('[0-9/-]', '', regex=True)
df['Destination'] = label_encoder.fit_transform(df['Destination'])

In [17]:
# Split the dataset into features and target variable
X = df.drop(['Transported'], axis=1)
y = df['Transported']

In [18]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
# Train the Logistic Regression model
model = LogisticRegression()
model.fit(X_train, y_train)

LogisticRegression()

In [20]:
# Make predictions on the test set
y_pred = model.predict(X_test)

In [21]:
# Calculate accuracy on the test set
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on the test set:", accuracy)

Accuracy on the test set: 0.7859304084720121


In [22]:
# Generate classification report
classification_rep = classification_report(y_test, y_pred)
print("Classification Report:\n", classification_rep)

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.67      0.75       653
           1       0.74      0.90      0.81       669

    accuracy                           0.79      1322
   macro avg       0.80      0.78      0.78      1322
weighted avg       0.80      0.79      0.78      1322



In [23]:
# Read the test dataset
t_df = pd.read_csv('D:\\SensViz\\Machine Learning Task 5\\test.csv')

In [24]:
# Perform data preprocessing on the test dataset
t_df.dropna(inplace=True)
t_df = t_df.drop('Name', axis=1)
t_df['VIP'] = label_encoder.fit_transform(t_df['VIP'])
t_df['CryoSleep'] = label_encoder.fit_transform(t_df['CryoSleep'])
t_df["HomePlanet"] = t_df["HomePlanet"].str.lower().str.replace(' ', '')
t_df['HomePlanet'] = label_encoder.fit_transform(t_df['HomePlanet'])
t_df['Destination'] = t_df['Destination'].str.replace('[0-9/-]', '', regex=True)
t_df['Destination'] = label_encoder.fit_transform(t_df['Destination'])
t_df['PassengerId'] = t_df['PassengerId'].str.replace('_', '').astype(float)
t_df['Cabin'] = t_df['Cabin'].str.replace('[0-9/]', '', regex=True)
t_df['Cabin'] = label_encoder.fit_transform(t_df['Cabin'])

In [25]:
# Make predictions on the test dataset
y_pred_test = model.predict(t_df)

submission_df = pd.DataFrame({'PassengerId': t_df['PassengerId'], 'Transported': y_pred_test})

# Save the predictions to a CSV file
submission_df.to_csv('submission.csv', index=False)
submission_df.head()

Unnamed: 0,PassengerId,Transported
0,1301.0,1
1,1801.0,0
2,1901.0,1
3,2101.0,1
4,2301.0,1
