# **Problem 2: Airplane crash**





In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV


# A.

In [2]:
# Read the passenger records; the path is resolved relative to the
# notebook's working directory.
data = pd.read_csv(filepath_or_buffer='crash.csv')

# Preview the first five rows to confirm the columns parsed as expected.
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,PassengerId,Survived,Class,Name,Sex,Age,Ticket Price,Safety
0,0,1,Didn't Survive,Economy,"Braund, Mr. Owen Harris",male,22.0,7.25,0.336957
1,1,2,Survived,First Class,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,71.2833,0.553571
2,2,3,Survived,Economy,"Heikkinen, Miss. Laina",female,26.0,7.925,0.336957
3,3,4,Survived,First Class,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,53.1,0.336957
4,4,5,Didn't Survive,Economy,"Allen, Mr. William Henry",male,35.0,8.05,0.336957


In [3]:
# Tally the NaN entries per column to see which features need imputing.
missing_values = data.isna().sum()

# Header line first, then the per-column counts on the following lines.
print("\nMissing values in each column:", missing_values, sep="\n")


Missing values in each column:
Unnamed: 0        0
PassengerId       0
Survived          0
Class             0
Name              0
Sex               0
Age             177
Ticket Price      0
Safety            2
dtype: int64


In [4]:
# Impute 'Age' and 'Safety' with their own column means.
# NOTE(review): the means are computed over the whole dataset, i.e. before
# the train/test split done later in the notebook — confirm this is acceptable.
column_means = {
    'Age': data['Age'].mean(),
    'Safety': data['Safety'].mean(),
}
data.fillna(value=column_means, inplace=True)

# Re-count NaNs per column; every entry is expected to be zero now.
missing_values_after = data.isna().sum()
print(missing_values_after)

Unnamed: 0      0
PassengerId     0
Survived        0
Class           0
Name            0
Sex             0
Age             0
Ticket Price    0
Safety          0
dtype: int64


# B.

In [5]:
# Features (X): everything except the target, the passenger name, the
# passenger id and the leftover 'Unnamed: 0' column.
X = data.drop(labels=['Survived', 'Name', 'PassengerId', 'Unnamed: 0'], axis=1)

# Target (y): the survival label.
y = data.loc[:, 'Survived']

# Show (rows, columns) of the full frame.
data.shape

(891, 9)

In [6]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the (rows, columns) shape of each partition.
print(f"Training set size: {X_train.shape}, Test set size: {X_test.shape}")

Training set size: (712, 5), Test set size: (179, 5)


# C.

In [7]:
# Convert categorical data to numeric.
# Each column gets its own LabelEncoder so the fitted category -> integer
# mapping stays available afterwards (e.g. encoders['Sex'].classes_ or
# encoders['Class'].inverse_transform(...)). A single encoder re-fitted on
# every column would only remember the column it saw last.
encoders = {}
for column in ['Class', 'Sex', 'Survived']:
    encoders[column] = LabelEncoder()
    data[column] = encoders[column].fit_transform(data[column])

# `encoder` stays bound to the encoder fitted on 'Survived' for any code
# that refers to that name.
encoder = encoders['Survived']

# Rebuild features/target from the encoded frame and split with the same
# test_size and seed used for the earlier split.
X = data.drop(columns=['Survived', 'Name', 'PassengerId', 'Unnamed: 0'])
y = data['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build and train the baseline SVM model (linear kernel, default C).
# NOTE(review): the features are fed in unscaled; linear SVMs are
# scale-sensitive and can converge slowly on unscaled inputs — consider
# standardizing inside a Pipeline.
model = SVC(kernel='linear', random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# precision_score / recall_score use scikit-learn's default positive label (1).
# LabelEncoder sorts its classes, so "Survived" presumably maps to 1 when
# encoding the raw labels — verify via encoders['Survived'].classes_.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")

Accuracy: 0.7821
Precision: 0.7536
Recall: 0.7027


# D.

In [8]:
# Hyper-parameter grid: only the linear kernel is searched, over three values
# of the regularization strength C.
# A wider grid (C in [0.1, 1, 10, 100], gamma in [1, 0.1, 0.01, 0.001],
# kernels 'linear'/'poly'/'rbf') is intentionally not enabled here.
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear'],
}

model = SVC(random_state=42)

# Perform grid search with 5-fold cross-validation on the training split.
# n_jobs=-1 runs the candidate fits in parallel on all available cores; it
# only affects wall-clock time, not the selected parameters or scores.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    verbose=2,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score: {:.4f}".format(grid_search.best_score_))

# With the default refit=True, best_estimator_ has already been refitted on
# the whole training split using the best parameters.
best_model = grid_search.best_estimator_
test_score = best_model.score(X_test, y_test)
print("Test Accuracy with Best Parameters: {:.4f}".format(test_score))

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] END ...............................C=0.1, kernel=linear; total time=   0.1s
[CV] END ...............................C=0.1, kernel=linear; total time=   4.2s
[CV] END ...............................C=0.1, kernel=linear; total time=   0.6s
[CV] END ...............................C=0.1, kernel=linear; total time=   0.1s
[CV] END ...............................C=0.1, kernel=linear; total time=   2.1s
[CV] END .................................C=1, kernel=linear; total time=  38.2s
[CV] END .................................C=1, kernel=linear; total time=  41.2s
[CV] END .................................C=1, kernel=linear; total time=  28.3s
[CV] END .................................C=1, kernel=linear; total time=  27.0s
[CV] END .................................C=1, kernel=linear; total time=  36.7s
[CV] END ................................C=10, kernel=linear; total time=  57.3s
[CV] END ................................C=10, ke

# E.

# F.

In [12]:
# Retrain an SVM with the hyper-parameters selected by the grid search.
# Reading them from grid_search.best_params_ (instead of hand-copying values)
# keeps this cell in sync with whatever the search actually chose, and avoids
# passing parameters such as gamma that have no effect on a linear kernel.
best_model = SVC(**grid_search.best_params_, random_state=42)

best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)

# Evaluate the model's performance on the held-out test split.
# precision/recall use scikit-learn's default positive label (1).
test_accuracy = accuracy_score(y_test, y_pred)
test_precision = precision_score(y_test, y_pred)
test_recall = recall_score(y_test, y_pred)

print("Test Accuracy with Best Parameters: {:.4f}".format(test_accuracy))
print("Test Precision with Best Parameters: {:.4f}".format(test_precision))
print("Test Recall with Best Parameters: {:.4f}".format(test_recall))

Test Accuracy with Best Parameters: 0.7821
Test Precision with Best Parameters: 0.7536
Test Recall with Best Parameters: 0.7027
