Import libraries

In [19]:
import numpy as np
import pandas as pd
import sklearn.datasets
import sklearn.linear_model
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso, LassoCV, LogisticRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns

pd.options.mode.chained_assignment = None

# Regression task: California housing dataset

In [17]:
# Create pandas dataframe from dataset
trainData = pd.read_csv('housing_coursework_train.csv')
testData = pd.read_csv('housing_coursework_test.csv')
frames = [trainData, testData]
data = pd.concat(frames)

# Separate features from target variable
features=['longitude',	'latitude',	'housing_median_age',	'total_rooms',	'total_bedrooms',
          'population',	'households',	'median_income', 'ocean_proximity']
X_raw = data[features]
y_raw = data['median_house_value']

# Describe numerical and categorical columns
display(X_raw.select_dtypes(include=np.number).describe())
display(X_raw.select_dtypes(exclude=np.number).describe())

# Drop unnessecary columns
# X_raw = X_raw.drop(columns=['No.']) # Remove the No. column using '.drop' (no longer included in features)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
count,1020.0,1020.0,1020.0,1020.0,1011.0,1020.0,1020.0,1020.0
mean,-119.562431,35.656882,27.623529,2732.830392,556.577646,1474.960784,515.614706,3.955444
std,1.970947,2.143429,12.311122,2168.037719,423.168029,1116.843167,382.273122,1.957958
min,-124.19,32.56,2.0,19.0,11.0,34.0,9.0,0.536
25%,-121.62,33.91,17.0,1482.0,301.5,807.5,287.75,2.607425
50%,-118.53,34.28,28.0,2206.5,452.0,1204.0,427.0,3.69075
75%,-118.0375,37.71,36.0,3260.0,672.5,1815.75,626.5,4.8568
max,-115.41,41.78,52.0,27700.0,4386.0,15037.0,4072.0,15.0001


Unnamed: 0,ocean_proximity
count,1020
unique,4
top,<1H OCEAN
freq,466


In [14]:
# Print numerical outliers
print(X_raw[(X_raw['total_rooms'] < 0) | (X_raw['total_rooms'] > 7500) |
            (X_raw['total_bedrooms'] < 0) | (X_raw['total_bedrooms'] > 1250) |
            (X_raw['population'] < 0) | (X_raw['population'] > 3000) | 
            (X_raw['households'] < 0) | (X_raw['households'] > 2000) | 
            (X_raw['median_income'] < 0) | (X_raw['median_income'] > 10)])

# Remove numerical outliers
# X_train_raw.loc[X_train_raw.total_rooms>1000,'total_rooms'] = np.nan  # Set total_rooms over 3500 to be NaN
# X_train_raw.loc[X_train_raw.total_bedrooms>900,'total_bedrooms'] = np.nan  # Set total_bedrooms over 900 to be NaN

# Print categorical outliers
print(pd.unique(X_raw['ocean_proximity']))
print(pd.unique(X_raw['ocean_proximity']))

     longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
14     -122.24     37.72                   5        18634          2885.0   
34     -122.01     37.57                  14        16199          2993.0   
35     -122.05     37.57                   7        10648          1818.0   
39     -121.92     37.72                   4         7477          1576.0   
47     -120.97     38.42                  16         1748           322.0   
..         ...       ...                 ...          ...             ...   
199    -119.17     34.16                  17         5276          1020.0   
201    -119.04     34.24                  20         7794          1192.0   
203    -119.06     34.22                  13         4175          1321.0   
207    -118.69     34.18                  11         1177           138.0   
208    -118.83     34.33                   6         6679          1164.0   

     population  households  median_income ocean_proximity  
14         742

In [23]:
# Split data into seperate numerical and categorical data
X_numerical = X_raw.select_dtypes(include=np.number)
X_categorical = X_raw.select_dtypes(exclude=np.number)

# Create our imputer objects
numeric_imputer = SimpleImputer(strategy='mean')
categorical_imputer = SimpleImputer(strategy='most_frequent')

# Fit the imputers on the testing data and transform column
numeric_imputer.fit(X_numerical)
categorical_imputer.fit(X_categorical)
X_numerical_imp = numeric_imputer.transform(X_numerical)  
X_categorical_imp = categorical_imputer.transform(X_categorical)

# Feature scaling 
scaler = MinMaxScaler()
scaler.fit(X_numerical_imp)  # Fit and transform numeric data
X_num_sca = scaler.transform(X_numerical_imp)

# Feature encoder
encoder = OneHotEncoder(handle_unknown ='ignore', sparse_output=False)
encoder.fit(X_categorical_imp)  # Fit and transform categorical data
X_onehot = encoder.transform(X_categorical_imp)

# Concatenate scalled data
X = np.concatenate([X_num_sca, X_onehot], axis=1)

print(X)

TypeError: __init__() got an unexpected keyword argument 'sparse_output'

In [10]:
# Split the data into non-test/test data
X_train, X_test_validate, y_train, y_test_validate = train_test_split(X, y_raw, test_size=0.20, shuffle=True, random_state=0)
X_test, X_validate, y_test, y_validate = train_test_split(X_test_validate, y_test_validate, test_size=0.50, shuffle=True, random_state=0)

NameError: name 'X' is not defined

In [24]:
# Linear regression [USE VALIDATION]
linear_obj = sklearn.linear_model.LinearRegression(fit_intercept=True)  # Create object
linear_obj.fit(X_train, y_train)  # Train the model using the training sets
linear_y_pred = linear_obj.predict(X_test)  # Make predictions using the testing set
# plt.scatter(y_test, linear_y_pred)  # Scatter
print(linear_obj.score(X_train, y_train))
print(linear_obj.score(X_validate, y_validate))

# Fine tuning
alpha = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
param_grid = dict(alpha=alpha)
grid = GridSearchCV(estimator=lasso, param_grid=param_grid, scoring='r2', verbose=1, n_jobs=-1)
grid_result = grid.fit(X_train, y_train)


# # Lasso
# lasso_obj = Lasso(alpha=1.0)  
# # lasso_obj.fit(X_train, y_train)
# lasso_obj = LassoCV(alphas = [0.0001, 0.001,0.01, 0.1, 1, 10]).fit(X_train, y_train)
# lasso_y_pred= lasso_obj.predict(X_test)  
# # plt.scatter(y_test, lasso_y_pred)
# print(lasso_obj.score(X_train, y_train))
# print(lasso_obj.score(X_test, y_test))

NameError: name 'X_train' is not defined

In [None]:
# Mean Squared Error(MSE)
print("Linear MSE",mean_squared_error(y_test, linear_y_pred))
print("Lasso MSE",mean_squared_error(y_test, lasso_y_pred))

# Mean Absolute Error (MAE)
print("Linear MAE",mean_absolute_error(y_test,linear_y_pred))
print("Lasso MAE",mean_absolute_error(y_test,lasso_y_pred))

# Root Mean Squared Error(RMSE)
print("Linear RMSE",np.sqrt(mean_squared_error(y_test,linear_y_pred)))
print("Lasso RMSE",np.sqrt(mean_squared_error(y_test,lasso_y_pred)))

# # Root Mean Squared Log Error(RMSLE)
# print("Linear RMSE",np.log(np.sqrt(mean_squared_error(y_test,linear_y_pred))))
# print("Lasso RMSE",np.log(np.sqrt(mean_squared_error(y_test,lasso_y_pred))))

# R Squared (R2)
print("Linear R2",r2_score(y_test,linear_y_pred))
print("Lasso R2",r2_score(y_test,lasso_y_pred))

# Adjusted R Squared


Data visualisation

In [None]:
# Data visualisation
sns.pairplot(data=testData, diag_kind='kde')

# Heatmap
sns.heatmap(testData[['longitude',	'latitude',	'housing_median_age',	'total_rooms',	'total_bedrooms',	'population',	'households',	'median_income', 'median_house_value']].corr(), cmap='Blues', annot=True)
# plt.show()

In [None]:
# Feature selection (Do not implement)
X_raw = SelectKBest(f_regression, k = 3).fit_transform(X_raw, y_raw)

# Classification task: Titanic dataset


In [None]:
# Load data
trainData = pd.read_csv('/content/sample_data/Titanic_train.csv')
testData = pd.read_csv('/content/sample_data/Titanic_test.csv')
frames = [trainData, testData]
data = pd.concat(frames)
display(data)

# Allocate features and target column
features=['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Embarked']
X_raw = data[features]
y_raw = data['Target: Survived']  # Target variable
# X_raw = X_raw.drop(columns=['PassengerId']) # Remove the PassengerId column using '.drop'

display(X_raw.head())
print(X_raw.shape)

In [None]:
# Identify outliers
display(X_raw.select_dtypes(include=np.number).describe()) # display numeric colmuns
display(X_raw.select_dtypes(exclude=np.number).describe()) # describe numeric columns for obvious outliers (missing ticket number and embarked location)

# Look at the numerical columns and see if there are any obvious outliers - no outliers found in numerical columns (besides NaN)
display(X_raw.select_dtypes(include=np.number).describe())
print(X_raw[(X_raw['Pclass'] < 1) | (X_raw['Pclass'] > 3) | 
            (X_raw['Age'] < 0) | (X_raw['Age'] > 100) | 
            (X_raw['SibSp'] < 0) | (X_raw['SibSp'] > 10) |
            (X_raw['Parch'] < 0) | (X_raw['Parch'] > 10) |
            (X_raw['Fare'] < 0) | (X_raw['Fare'] > 250)])

# Overwrite outliers - none (besides NaN)

# Print unique values
print(pd.unique(trainData['Sex']))
print(pd.unique(trainData['Embarked']))

In [None]:
# Split data into seperate numerical and categorical data
X_numerical = X_raw.select_dtypes(include=np.number)
X_categorical = X_raw.select_dtypes(exclude=np.number)

# Create our imputer objects
numeric_imputer = SimpleImputer(strategy='mean')
categorical_imputer = SimpleImputer(strategy='most_frequent')

# Fit the imputers on the testing data and transform column
numeric_imputer.fit(X_numerical)
categorical_imputer.fit(X_categorical)
X_numerical_imp = numeric_imputer.transform(X_numerical)  
X_categorical_imp = categorical_imputer.transform(X_categorical)

# Feature scaling 
scaler = MinMaxScaler()
scaler.fit(X_numerical_imp)  # Fit and transform numeric data
X_num_sca = scaler.transform(X_numerical_imp)

# Feature encoder
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoder.fit(X_categorical_imp)  # Fit and transform categorical data
X_onehot = encoder.transform(X_categorical_imp)

# Concatenate scalled data
X = np.concatenate([X_num_sca, X_onehot], axis=1)

print(X)

In [None]:
# Split the data into non-test/test data
X_train, X_test, y_train, y_test = train_test_split(X, y_raw, test_size=0.20, shuffle=True, random_state=0)

In [None]:
# Implement classification models (ligistic & SVM)
logistic_obj = LogisticRegression()
logistic_obj.fit(X_train, y_train)
logistic_pred = logistic_obj.predict(X_test)
# Use score method to get accuracy of model
score = logistic_obj.score(X_test, y_test)
print(score)

# SVM
SVM_obj = SVC(kernel='linear')
SVM_obj.fit(X_train, y_train)
SVM_pred = SVM_obj.predict(X_test)
accuracy = accuracy_score(y_test, SVM_pred)
print(accuracy)

In [None]:
# Evaluation metrics
logistic_accuracy = accuracy_score(y_test, logistic_pred)
logistic_precision = precision_score(y_test, logistic_pred)
logistic_recall = recall_score(y_test, logistic_pred)
logistic_f1score = f1_score(y_test, logistic_pred)

SVM_accuracy = accuracy_score(y_test, SVM_pred)
SVM_precision = precision_score(y_test, SVM_pred)
SVM_recall = recall_score(y_test, SVM_pred)
SVM_f1score = f1_score(y_test, SVM_pred)

print(f"Logistic Accuracy = {logistic_accuracy.round(4)}")
print(f"SVM Accuracy = {SVM_accuracy.round(4)}")

print(f"Logistic Precision = {logistic_precision.round(4)}")
print(f"SVM Precision = {SVM_precision.round(4)}")

print(f"Logistic Recall = {logistic_recall.round(4)}")
print(f"SVM Recall = {SVM_recall.round(4)}")

print(f"Logistic F1 Score = {logistic_f1score.round(4)}")
print(f"SVM F1 Score = {SVM_f1score.round(4)}")

# Generate confusion matrix for the predictions
logistic_conf_matrix = confusion_matrix(y_test, logistic_pred)
print("Logistic Confsion Matrix:")
logistic_conf_matrix

SVM_conf_matrix = confusion_matrix(y_test, SVM_pred)
print("SVM Confsion Matrix:")
SVM_conf_matrix