# MACHINE LEARNING 
 ### **Topics Included:**<br>
**1. Regression**
 * 1.1 Simple Linear Regression
 * 1.2 Multiple Linear Regression
 * 1.3 Polynomial Linear Regression
 * 1.4 Decision Tree Regressor
 * 1.5 Random Forest Regressor
 * 1.6 K-nearest neighbour Regressor
 * 1.7 Support Vector Regressor

 
**2. Classification**
 * 2.1 Logistic Regression
 * 2.2 Decision Tree Classifier
 * 2.3 Random Forest Classifier
 * 2.4 K-nearest neighbour
 * 2.5 Naive Bayes
 * 2.6 Support Vector Machine

----

## **1. Regression**

In [18]:
# Basic libraries
import numpy as np                            # linear algebra
import pandas as pd                          # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt            # data visualization library
import seaborn as sns                     # data visualization

# Model libraries
from sklearn.linear_model import LinearRegression          # Linear regression
from sklearn.preprocessing import PolynomialFeatures     # Polynomial features
from sklearn.tree import DecisionTreeRegressor          # Decision tree
from sklearn.ensemble import RandomForestRegressor     # Random forest
from sklearn.neighbors import KNeighborsRegressor     # KNN
from sklearn.svm import SVR                         # Support vector regression

# Data splitting (Train and Test) libraries
from sklearn.model_selection import train_test_split   # Splitting data into train and test sets

# Evaluation libraries
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error # Evaluation metrics for regression models

In [19]:
# Load seaborn's "titanic" dataset and preview its first rows
data = sns.load_dataset(name="titanic")
data.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [20]:
# Clean the data: drop the columns that are not needed, then drop every row
# that still contains a NaN value.
# errors="ignore" keeps this cell re-runnable: on a second execution the two
# columns are already gone and a plain drop would raise KeyError.
data = (
    data
    .drop(columns=["deck", "embark_town"], errors="ignore")
    .dropna()
)
data.shape

(712, 13)

In [21]:
# Feature matrix (X) and regression target (y), both taken from `data`
feature_columns = ["survived", "pclass", "age"]
X = data[feature_columns]
y = data["fare"]

In [22]:
# For Polynomial Regression Model
# poly = PolynomialFeatures(degree=2, include_bias=False)
# poly_features = poly.fit_transform(X)

# For Linear Regression Model
# poly_reg_model = LinearRegression()
# poly_reg_model.fit(poly_features, y)
# y_predicted = poly_reg_model.predict(poly_features)

In [23]:
# Instantiate one regressor of each kind under a short name.
# The tree-based models draw random numbers while fitting, so they are seeded
# (same seed as the train/test split) to make the reported scores reproducible.
lr = LinearRegression()
dt = DecisionTreeRegressor(random_state=42)
rf = RandomForestRegressor(random_state=42)
svr = SVR()
knn = KNeighborsRegressor()

In [24]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Largest train/test R2 gap for which a model is still reported.
# NOTE(review): R2 is at most 1, so a gap of 5 almost never filters anything out.
# The value is kept so the report is unchanged; tighten it to screen overfit models.
MAX_R2_GAP = 5

# Fit every model on the training data and evaluate it on both splits.
for model in [lr, dt, rf, svr, knn]:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)          # predictions for the held-out rows
    y_train_pred = model.predict(X_train)   # predictions for the rows the model was fitted on
    test_score = r2_score(y_test, y_pred)
    train_score = r2_score(y_train, y_train_pred)
    if abs(train_score - test_score) <= MAX_R2_GAP:
        # Compute the MSE once and derive the RMSE from it. The `squared=False`
        # argument of mean_squared_error is no longer accepted by recent
        # scikit-learn releases, so the square root is taken explicitly.
        mse = mean_squared_error(y_test, y_pred)
        print(model)
        print("Train score:", train_score)
        print("Test score:", test_score)
        print("R2 score:", test_score)
        print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))
        print("Mean Squared Error:", mse)
        print("SquareRoot Mean Squared Error:", np.sqrt(mse))  # RMSE
        print("-"*50)  # separator between models

LinearRegression()
Train score: 0.3387103577988051
Test score: 0.262816627569991
R2 score: 0.262816627569991
Mean Absolute Error: 25.28436550535477
Mean Squared Error: 3815.2338870351846
SquareRoot Mean Squared Error: 61.76757957889547
--------------------------------------------------
DecisionTreeRegressor()
Train score: 0.6289302811547886
Test score: 0.3496706293802059
R2 score: 0.3496706293802059
Mean Absolute Error: 23.181609111593534
Mean Squared Error: 3365.727911556324
SquareRoot Mean Squared Error: 58.01489387697201
--------------------------------------------------
RandomForestRegressor()
Train score: 0.6122720065156082
Test score: 0.35530929119361676
R2 score: 0.35530929119361676
Mean Absolute Error: 22.509328127900336
Mean Squared Error: 3336.5454660039477
SquareRoot Mean Squared Error: 57.7628381055151
--------------------------------------------------
SVR()
Train score: -0.09759445459538507
Test score: -0.1315079773918424
R2 score: -0.1315079773918424
Mean Absolute Error: 

## **2. Classification**

In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression # Logistic regression
from sklearn.tree import DecisionTreeClassifier      # Decision tree
from sklearn.ensemble import RandomForestClassifier # Random forest
from sklearn.neighbors import KNeighborsClassifier # KNN
from sklearn.naive_bayes import GaussianNB        # Naive Bayes
from sklearn import svm                         # Support vector machine

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report # Evaluation metrics for classification models

In [29]:
# Import dataset
tips = sns.load_dataset("tips")
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [30]:
# Target (y) is the "sex" column; every remaining column is a feature (X)
y = tips["sex"]
X = tips.drop(columns=["sex"])

In [31]:
from sklearn.preprocessing import LabelEncoder # Label encoder

# Replace each text-valued feature with integer codes, using a fresh encoder per column
for column in ("smoker", "day", "time"):
    X[column] = LabelEncoder().fit_transform(X[column])

In [32]:
# Instantiate one classifier of each kind.
# The tree-based models draw random numbers while fitting, so they are seeded
# (same seed as the train/test split) to make the reported scores reproducible.
lr_c = LogisticRegression()
dt_c = DecisionTreeClassifier(random_state=42)
rf_c = RandomForestClassifier(n_estimators=80, random_state=42)
knn_c = KNeighborsClassifier(n_neighbors=5)
nb_c = GaussianNB()
svm_c = svm.SVC(kernel="linear")

In [41]:
# Evaluation loop for the classification models
import warnings
warnings.filterwarnings('ignore')  # suppresses ALL warnings from here on, not only those raised in this cell

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit each model on the training data and report its performance on the test data.
for model in [lr_c, dt_c, rf_c, knn_c, nb_c, svm_c]:
    model.fit(X_train, y_train)
    # Predict once and build the confusion matrix once; both are reused below
    # instead of being recomputed for every print.
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    print(model)
    print(cm)
    print(f"""
Score: {round(accuracy_score(y_test,y_pred),4)}
Confusion Matrix: \n{cm}
Classification Report: \n{classification_report(y_test, y_pred)}
    """)
    print(model,"classification_report")
    print("-"*60)  # separator between models

LogisticRegression()
[[ 2 17]
 [ 2 28]]

Score: 0.6122
Confusion Matrix: 
[[ 2 17]
 [ 2 28]]
Classification Report: 
              precision    recall  f1-score   support

      Female       0.50      0.11      0.17        19
        Male       0.62      0.93      0.75        30

    accuracy                           0.61        49
   macro avg       0.56      0.52      0.46        49
weighted avg       0.57      0.61      0.52        49

    
LogisticRegression() classification_report
------------------------------------------------------------
DecisionTreeClassifier()
[[ 8 11]
 [ 8 22]]

Score: 0.6122
Confusion Matrix: 
[[ 8 11]
 [ 8 22]]
Classification Report: 
              precision    recall  f1-score   support

      Female       0.50      0.42      0.46        19
        Male       0.67      0.73      0.70        30

    accuracy                           0.61        49
   macro avg       0.58      0.58      0.58        49
weighted avg       0.60      0.61      0.60        49
