## Training SVM

In [1]:
# Importing dependencies
from path import Path
from sqlalchemy import create_engine
from config import db_password
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from matplotlib import pyplot as plt
from sklearn import svm
from sklearn.metrics import balanced_accuracy_score
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import confusion_matrix

In [2]:
# Read in the dataset after preprocessing and feature extraction
data = Path('features.csv')
features_df = pd.read_csv(data)
features_df.head()

Unnamed: 0.1,Unnamed: 0,key,letters,form,upos,xpos,FormWithoutLemma,Counts,MorphemeSeparated,index1,prefix,vowel,morpheme boundary,noun,verb,letter_label_encoded,prefix_label_encoded
0,0,0,ச,சென்னை,N,NEN-3SN--,,0,"{ச,ெ,ன,்,ன,ை}",0,[],0,0,1,0,40,11374
1,1,0,ெ,சென்னை,N,NEN-3SN--,,0,"{ச,ெ,ன,்,ன,ை}",0,['ச'],1,0,1,0,65,5375
2,2,0,ன,சென்னை,N,NEN-3SN--,,0,"{ச,ெ,ன,்,ன,ை}",0,"['ச', 'ெ']",0,1,1,0,47,5318
3,3,0,்,சென்னை,N,NEN-3SN--,,0,"{ச,ெ,ன,்,ன,ை}",0,"['ச', 'ெ', 'ன']",1,0,1,0,71,5065
4,4,0,ன,சென்னை,N,NEN-3SN--,,0,"{ச,ெ,ன,்,ன,ை}",0,"['ச', 'ெ', 'ன', '்']",0,0,1,0,47,5064


In [3]:
# Setting features and target
y = features_df["morpheme boundary"]
X = features_df.drop(columns=['Unnamed: 0','key', 'letters','form', 'MorphemeSeparated', 'morpheme boundary', 'prefix', 'upos', 'xpos','FormWithoutLemma','Counts'])
y=y.astype('int')

In [4]:
X.head()

Unnamed: 0,index1,vowel,noun,verb,letter_label_encoded,prefix_label_encoded
0,0,0,1,0,40,11374
1,0,1,1,0,65,5375
2,0,0,1,0,47,5318
3,0,1,1,0,71,5065
4,0,0,1,0,47,5064


In [5]:
# Scale the data
scaler = StandardScaler()
X = scaler.fit_transform(X)
# Split testing and training data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
X_train.shape

(47124, 6)

In [6]:
model = SVC(kernel='rbf', gamma = 0.8, C=1)

In [7]:
model.fit(X_train, y_train)

SVC(C=1, gamma=0.8)

In [8]:
y_pred = model.predict(X_test)
results = pd.DataFrame({
"Prediction": y_pred,
"Actual": y_test
}).reset_index(drop=True)
results.head()

Unnamed: 0,Prediction,Actual
0,0,1
1,0,0
2,0,0
3,1,0
4,1,1


In [9]:
acc_score = balanced_accuracy_score(y_test, y_pred)
(acc_score)*100

76.05529650972424

In [10]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.82      0.81      0.71      0.81      0.76      0.58      9642
          1       0.70      0.71      0.81      0.71      0.76      0.57      6067

avg / total       0.77      0.77      0.75      0.77      0.76      0.58     15709



In [11]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,7820,1822
Actual 1,1759,4308
