In [None]:
# Enable autoreload (mode 2) so edited modules are re-imported before each cell runs.
%load_ext autoreload
%autoreload 2

# Render matplotlib figures inline in the cell output.
%matplotlib inline

## Install libraries

```bash
conda create -n edu4 python=3.11 jupyter matplotlib
```

```bash
pip install -U -r requirements.txt
```

```bash
pip install -U numpy
pip install -U scikit-learn
```

In [None]:
# Show the contents of the current working directory.
! ls

In [None]:
! pip install -U -r requirements.txt

## Update repository

In [None]:
# Fetch and merge the latest commits for the repository in the working directory.
! git pull

## Add import path

In [None]:
import os
import sys
import gc

In [None]:
# Put the parent directory on the import path (only once) so that modules
# living one level above this notebook can be imported.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [None]:
# Remove the temporary name from the namespace; the sys.path entry itself stays.
del module_path

## Organize imports

In [None]:
import multiprocessing
from pathlib import Path

In [None]:
import seaborn as sns

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import (StandardScaler, PolynomialFeatures, 
                                   SplineTransformer, LabelEncoder)
from sklearn.feature_selection import (SequentialFeatureSelector as SFS, 
                                       SelectFromModel)
from sklearn.discriminant_analysis import (LinearDiscriminantAnalysis, 
                                           QuadraticDiscriminantAnalysis)
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.svm import SVC, LinearSVC, SVR, LinearSVR
import xgboost as xgb
from xgboost import XGBRegressor, XGBClassifier
from sklearn.metrics import (mean_absolute_error, mean_squared_error, 
                             classification_report, confusion_matrix)

#### Number of CPU cores

In [None]:
# Number of CPUs reported by the system; the bare name on the last line displays it.
workers = multiprocessing.cpu_count()
workers

## Initialize path

In [None]:
# Root of all datasets, relative to the working directory.
DATA = Path('data')

# Per-topic sub-directories.
LR_PATH = DATA.joinpath('linear_regression')
PATH = DATA.joinpath('log_regr_lda_qda_np')

# Dataset directories used by the classification sections.
IRIS_DIR = PATH.joinpath('iris')
PUMPKIN_DIR = PATH.joinpath('Pumpkin_Seeds_Dataset')

# Create the dataset directories (and any missing parents); no error if present.
PUMPKIN_DIR.mkdir(parents=True, exist_ok=True)
IRIS_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# List the working directory again (the data directories were created just above).
! ls

## Load IRIS dataset

In [None]:
# Seed passed as random_state to the train/test splits below.
SEED = 2024

In [None]:
# Kaggle download page of the Iris dataset, kept for reference.
# NOTE(review): nothing visible in this notebook fetches it; Iris.csv is read from IRIS_DIR.
iris_url = 'https://www.kaggle.com/datasets/uciml/iris/download?datasetVersionNumber=2'

#### Load dataset

In [None]:
# List the Iris data directory ({IRIS_DIR} is interpolated from the Python variable).
! ls {IRIS_DIR}

In [None]:
# Read the Iris CSV from the dataset directory into a DataFrame.
df = pd.read_csv(IRIS_DIR / 'Iris.csv')

In [None]:
# Encode the species names as integer class labels and store them in a
# new 'labels' column next to the original 'Species' column.
le = LabelEncoder()
iris_labels = le.fit_transform(df['Species'])
df['labels'] = iris_labels

# Display the frame including the new column (last expression of the cell).
df

In [None]:
# Target: the integer-encoded species.
y = df['labels']

# Features: everything except the row id and the two target columns.
X = df.drop(columns=['Id', 'Species', 'labels'])

# Shapes and per-class sample counts, shown as the cell output.
(X.shape, y.shape, df['Species'].value_counts())

In [None]:
# NOTE(review): this cell repeats the label encoding and the X/y split
# performed by the cells directly above it.

# Integer-encode the species names and attach them as the 'labels' column.
le = LabelEncoder()
iris_labels = le.fit_transform(df['Species'])
df.loc[:, 'labels'] = iris_labels

# Separate the target (y) from the feature columns (X).
y = df['labels']
X = df.drop(columns=['Id', 'Species', 'labels'])

# Shapes plus class balance, displayed as the cell output.
(X.shape, y.shape, df['Species'].value_counts())

In [None]:
# NOTE(review): same X/y split as in the preceding cells, repeated here.
y = df['labels']
X = df.drop(columns=['Id', 'Species', 'labels'])

# Feature-matrix shape, target shape and per-species counts.
(X.shape, y.shape, df['Species'].value_counts())

In [None]:
# Hold out 20% of the samples for testing; SEED makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
)

In [None]:
# Inspect the training targets.
y_train

## Scaling / standardizing the parameters

In [None]:
# Fit the scaler on the training split only, so the test split does not
# influence the scaling statistics.
scaler = StandardScaler().fit(X_train)

In [None]:
# Apply the scaler fitted on the training split to both splits.
# NOTE: X_train / X_test are overwritten, so re-running this cell would
# transform the already-transformed arrays a second time.
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Support vector classifiers

In [None]:
# Linear-kernel support vector classifier with a one-vs-rest decision function.
# random_state uses the notebook-wide SEED (as cls_ker does) rather than a
# separate hard-coded number.
cls_svc = SVC(C=1.0,
              kernel='linear',
              verbose=True,
              decision_function_shape='ovr',
              random_state=SEED)

In [None]:
# Linear support vector classifier (LinearSVC) with a raised iteration cap.
# random_state uses the notebook-wide SEED (as cls_ker does) rather than a
# separate hard-coded number.
cls_lin = LinearSVC(C=1.0,
                    verbose=True,
                    random_state=SEED,
                    max_iter=10000)

In [None]:
# RBF-kernel support vector classifier with a one-vs-rest decision function.
# random_state uses the notebook-wide SEED (as cls_ker does) rather than a
# separate hard-coded number.
cls_rbf = SVC(C=1.0,
              kernel='rbf',
              verbose=True,
              decision_function_shape='ovr',
              random_state=SEED)

## Train models

In [None]:
cls_svc = cls_svc.fit(X_train, y_train)
cls_svc

In [None]:
y_pred = cls_svc.predict(X_test)

In [None]:
cr = classification_report(y_test, y_pred)
print(cr)

In [None]:
# Confusion matrix of the linear-kernel SVC on the test split.
# Rows are the true classes and columns the predicted ones, so label the
# axes accordingly; the trailing ';' hides the return value of ax.set().
cm = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(cm, annot=True, fmt='d')
ax.set(xlabel='Predicted label', ylabel='True label',
       title='SVC (linear kernel)');

In [None]:
cls_lin = cls_lin.fit(X_train, y_train)
cls_lin

In [None]:
y_pred = cls_lin.predict(X_test)

In [None]:
cr = classification_report(y_test, y_pred)
print(cr)

In [None]:
# Confusion matrix of the LinearSVC on the test split.
# Rows are the true classes and columns the predicted ones, so label the
# axes accordingly; the trailing ';' hides the return value of ax.set().
cm = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(cm, annot=True, fmt='d')
ax.set(xlabel='Predicted label', ylabel='True label',
       title='LinearSVC');

In [None]:
cls_rbf = cls_rbf.fit(X_train, y_train)
cls_rbf

In [None]:
y_pred = cls_rbf.predict(X_test)

In [None]:
cr = classification_report(y_test, y_pred)
print(cr)

In [None]:
# Confusion matrix of the RBF-kernel SVC on the test split.
# Rows are the true classes and columns the predicted ones, so label the
# axes accordingly; the trailing ';' hides the return value of ax.set().
cm = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(cm, annot=True, fmt='d')
ax.set(xlabel='Predicted label', ylabel='True label',
       title='SVC (RBF kernel)');

## Classification for high dimensional features

Classification of high-dimensional features with a kernel SVM.

#### Initialize model

In [None]:
# RBF-kernel SVC used for the pumpkin-seeds data below: one-vs-rest decision
# function, verbose solver output, seeded with the notebook-wide SEED.
cls_ker = SVC(
    kernel='rbf',
    C=1.0,
    decision_function_shape='ovr',
    random_state=SEED,
    verbose=True,
)

#### Prepare data

In [None]:
# Seed for the split below (same value as the SEED assigned in the Iris section).
SEED = 2024

In [None]:
studen_scores_path = PUMPKIN_DIR / 'Pumpkin_Seeds_Dataset.xlsx'

In [None]:
df = pd.read_excel(studen_scores_path)
df

In [None]:
# (rows, columns) of the loaded frame.
df.shape

## Data analysis

In [None]:
# Number of samples per class, to check class balance.
df['Class'].value_counts() 

In [None]:
# Summary statistics of the numeric columns, transposed to one row per column.
df.describe().T

In [None]:
# Target: the class column. Features: every other column.
y = df['Class']
X = df.drop(columns=['Class'])

In [None]:
# Encode the two class names as 0/1 with an explicit mapping.
# Chained .replace() calls rely on the implicit object->int downcast that
# pandas deprecated in 2.2; map() + astype() states the target dtype directly,
# and astype() raises if a value is not covered by the mapping.
# Mapping from df['Class'] (not from y) keeps the cell safe to re-run.
y = df['Class'].map({'Çerçevelik': 0, 'Ürgüp Sivrisi': 1}).astype('int64')

In [None]:
# 75/25 train/test split, repeatable through SEED.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.25, random_state=SEED
)

In [None]:
# Shapes of the four split parts, to confirm that features and targets line up.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

#### Scaling Data

In [None]:
# Fit the scaler on the training split only, then apply the same fitted
# scaler to both the training and the test split.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

#### Fit the model

In [None]:
cls_ker = cls_ker.fit(X_train, y_train)
cls_ker

In [None]:
y_pred = cls_ker.predict(X_test)

In [None]:
cr = classification_report(y_test, y_pred)
print(cr)

In [None]:
# Confusion matrix of the RBF-kernel SVC on the pumpkin-seeds test split.
# Rows are the true classes and columns the predicted ones, so label the
# axes accordingly; the trailing ';' hides the return value of ax.set().
cm = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(cm, annot=True, fmt='d')
ax.set(xlabel='Predicted label', ylabel='True label',
       title='SVC (RBF kernel) - pumpkin seeds');

In [None]:
error = cm[0][1] + cm[1][0]
corrc = cm[0][0] + cm[1][1]

In [None]:
error / (error + corrc)

## Support vector Regression

There are also support vector regression models

#### Initialize the model

In [None]:
# Linear support vector regressor with a raised iteration cap.
# random_state uses the notebook-wide SEED (as cls_ker does) rather than a
# separate hard-coded number.
svr_lin = LinearSVR(C=1.0,
                    verbose=True,
                    max_iter=10000,
                    random_state=SEED)

# RBF-kernel support vector regressor with C=100.
svr_rbf = SVR(C=100.0,
              kernel='rbf',
              verbose=True)

#### Prepare the data

In [None]:
pertol_consumption_path = LR_PATH / 'petrol_consumption.csv'

In [None]:
df = pd.read_csv(pertol_consumption_path)
df

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Summary statistics rounded to two decimals, one row per column.
# Left as the cell's last expression so the notebook renders it as a table,
# matching the describe() cell used for the pumpkin-seeds data.
df.describe().round(2).T

#### Prepare dataset

In [None]:
# Target: petrol consumption. Features: the four explanatory columns, in
# this fixed order.
y = df['Petrol_Consumption']
X = df.loc[:, ['Average_income',
               'Paved_Highways',
               'Population_Driver_licence(%)',
               'Petrol_tax']]

In [None]:
# 80/20 train/test split, repeatable through SEED.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

#### Scale the parameters

In [None]:
# Fit the scaler on the training split only, then apply the same fitted
# scaler to both the training and the test split.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

#### Fit the models

In [None]:
# Train the linear SVR on the scaled training split.
svr_lin = svr_lin.fit(X_train, y_train)

In [None]:
# Predicted target values for the held-out split.
y_pred = svr_lin.predict(X_test)

In [None]:
# Error metrics on the test split; RMSE is the square root of the MSE.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Report each metric with two decimals.
print(f'Mean absolute error: {mae:.2f}')
print(f'Mean squared error: {mse:.2f}')
print(f'Root mean squared error: {rmse:.2f}')

In [None]:
# Train the RBF-kernel SVR on the scaled training split.
svr_rbf = svr_rbf.fit(X_train, y_train)

In [None]:
# Predicted target values for the held-out split.
y_pred = svr_rbf.predict(X_test)

In [None]:
# Error metrics on the test split; RMSE is the square root of the MSE.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Report each metric with two decimals.
print(f'Mean absolute error: {mae:.2f}')
print(f'Mean squared error: {mse:.2f}')
print(f'Root mean squared error: {rmse:.2f}')