<a href="https://colab.research.google.com/github/NanPyaeNyeinThar/MyMachineLearning/blob/main/Classification/News_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [38]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import scipy.sparse as sp

# Load your dataset
df = pd.read_csv('https://raw.githubusercontent.com/NanPyaeNyeinThar/SML_Teaching/main/data/news_data.csv',  encoding='latin-1', header=None)
df.columns = ['class', 'news']
label_mapping = {'positive': 1, 'negative': -1, 'neutral': 0}
df['class'] = df['class'].map(label_mapping)

vectorizer = TfidfVectorizer() #max_features=1000

X = vectorizer.fit_transform(df['news'])
y = df['class']

print(X.shape)


(4846, 10070)


In [39]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,
                                    test_size = 0.40,
                                    random_state=1)
X_test.shape

(1939, 10070)

In [40]:
#--------------------------------------------------
## ----------- K-NN Classifier ------------------##
#--------------------------------------------------
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

steps = [('scaler', StandardScaler(with_mean=False)),
         ('knn', KNeighborsClassifier(n_neighbors = 3))]

knn_pipeline = Pipeline(steps)
knn_pipeline.fit(X_train, y_train)

ypred_test = knn_pipeline.predict(X_test)
mat_clf = confusion_matrix(y_test, ypred_test)
report_clf = classification_report(y_test, ypred_test)

print(mat_clf)
print(report_clf)

[[  67    2  178]
 [  47   77 1006]
 [  42    4  516]]
              precision    recall  f1-score   support

          -1       0.43      0.27      0.33       247
           0       0.93      0.07      0.13      1130
           1       0.30      0.92      0.46       562

    accuracy                           0.34      1939
   macro avg       0.55      0.42      0.31      1939
weighted avg       0.68      0.34      0.25      1939



In [41]:
#--------------------------------------------------
## ----------- Logistic Regresion ------------------##
#--------------------------------------------------
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

steps = [('scaler', StandardScaler(with_mean=False)),
         ('logReg', LogisticRegression(penalty = "l2", C = 1.0, solver='saga', max_iter=5000))]

LR_pipeline = Pipeline(steps)
LR_pipeline.fit(X_train, y_train)

ypred_test = LR_pipeline.predict(X_test)
mat_clf = confusion_matrix(y_test, ypred_test)
report_clf = classification_report(y_test, ypred_test)

print(mat_clf)
print(report_clf)


[[125  81  41]
 [ 33 950 147]
 [ 23 238 301]]
              precision    recall  f1-score   support

          -1       0.69      0.51      0.58       247
           0       0.75      0.84      0.79      1130
           1       0.62      0.54      0.57       562

    accuracy                           0.71      1939
   macro avg       0.68      0.63      0.65      1939
weighted avg       0.70      0.71      0.70      1939



In [42]:
#--------------------------------------------------
## ------------ SVM Classifier ------------------##
#--------------------------------------------------
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

## Linear Kernel  ---------------
steps = [('scaler', StandardScaler(with_mean=False)),
         ('svc', SVC(kernel = 'linear',
                     class_weight='balanced'))]

svcL_pipeline = Pipeline(steps)
svcL_pipeline.fit(X_train, y_train)

ypred_test = svcL_pipeline.predict(X_test)
mat_clf = confusion_matrix(y_test, ypred_test)
report_clf = classification_report(y_test, ypred_test)

print(mat_clf)
print(report_clf)

[[122  75  50]
 [ 43 879 208]
 [ 41 241 280]]
              precision    recall  f1-score   support

          -1       0.59      0.49      0.54       247
           0       0.74      0.78      0.76      1130
           1       0.52      0.50      0.51       562

    accuracy                           0.66      1939
   macro avg       0.62      0.59      0.60      1939
weighted avg       0.65      0.66      0.66      1939



In [43]:
## Polynomial Kernel -----------------------
steps = [('scaler', StandardScaler(with_mean=False)),
         ('svc', SVC(kernel = 'poly', degree = 3,
                     class_weight='balanced'))]

svcPoly_pipeline = Pipeline(steps)
svcPoly_pipeline.fit(X_train, y_train)

ypred_test = svcPoly_pipeline.predict(X_test)
mat_clf = confusion_matrix(y_test, ypred_test)
report_clf = classification_report(y_test, ypred_test)

print(mat_clf)
print(report_clf)

[[   5  241    1]
 [   2 1121    7]
 [   4  551    7]]
              precision    recall  f1-score   support

          -1       0.45      0.02      0.04       247
           0       0.59      0.99      0.74      1130
           1       0.47      0.01      0.02       562

    accuracy                           0.58      1939
   macro avg       0.50      0.34      0.27      1939
weighted avg       0.53      0.58      0.44      1939



In [44]:
## RBF Kernel -----------------------
steps = [('scaler', StandardScaler(with_mean=False)),
         ('svc', SVC(kernel = 'rbf', gamma = 'scale',
                     class_weight='balanced'))]

svcRBF_pipeline = Pipeline(steps)
svcRBF_pipeline.fit(X_train, y_train)

ypred_test = svcRBF_pipeline.predict(X_test)
mat_clf = confusion_matrix(y_test, ypred_test)
report_clf = classification_report(y_test, ypred_test)

print(mat_clf)
print(report_clf)

[[ 119  104   24]
 [  22 1046   62]
 [  34  308  220]]
              precision    recall  f1-score   support

          -1       0.68      0.48      0.56       247
           0       0.72      0.93      0.81      1130
           1       0.72      0.39      0.51       562

    accuracy                           0.71      1939
   macro avg       0.71      0.60      0.63      1939
weighted avg       0.71      0.71      0.69      1939



In [45]:
#--------------------------------------------------
## ----------- Naive-Bayes Classifier ------------------##
#--------------------------------------------------
## GaussianNB  ---------------
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train.toarray(), y_train)

ypred_test = gnb.predict(X_test.toarray())
mat_clf = confusion_matrix(y_test, ypred_test)
report_clf = classification_report(y_test, ypred_test)

print(mat_clf)
print(report_clf)

[[100  89  58]
 [ 77 787 266]
 [ 67 260 235]]
              precision    recall  f1-score   support

          -1       0.41      0.40      0.41       247
           0       0.69      0.70      0.69      1130
           1       0.42      0.42      0.42       562

    accuracy                           0.58      1939
   macro avg       0.51      0.51      0.51      1939
weighted avg       0.58      0.58      0.58      1939



In [46]:
## MultinomialNB  ---------------
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB(alpha=1.0)
mnb.fit(X_train.toarray(), y_train)

ypred_test = mnb.predict(X_test.toarray())
mat_clf = confusion_matrix(y_test, ypred_test)
report_clf = classification_report(y_test, ypred_test)

print(mat_clf)
print(report_clf)

[[   4  178   65]
 [   0 1119   11]
 [   1  424  137]]
              precision    recall  f1-score   support

          -1       0.80      0.02      0.03       247
           0       0.65      0.99      0.78      1130
           1       0.64      0.24      0.35       562

    accuracy                           0.65      1939
   macro avg       0.70      0.42      0.39      1939
weighted avg       0.67      0.65      0.56      1939



In [47]:
## BernoulliNB  ---------------
from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB(alpha=1.0)
bnb.fit(X_train, y_train)

ypred_test = mnb.predict(X_test.toarray())
mat_clf = confusion_matrix(y_test, ypred_test)
report_clf = classification_report(y_test, ypred_test)

print(mat_clf)
print(report_clf)

[[   4  178   65]
 [   0 1119   11]
 [   1  424  137]]
              precision    recall  f1-score   support

          -1       0.80      0.02      0.03       247
           0       0.65      0.99      0.78      1130
           1       0.64      0.24      0.35       562

    accuracy                           0.65      1939
   macro avg       0.70      0.42      0.39      1939
weighted avg       0.67      0.65      0.56      1939

