<a href="https://colab.research.google.com/github/NanPyaeNyeinThar/MyMachineLearning/blob/main/Classification/News_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [125]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import scipy.sparse as sp

# Load your dataset
df = pd.read_csv('https://raw.githubusercontent.com/NanPyaeNyeinThar/SML_Teaching/main/data/news_data.csv',  encoding='latin-1', header=None)
df.columns = ['class', 'news']
label_mapping = {'positive': 1, 'negative': -1, 'neutral': 0}
df['class'] = df['class'].map(label_mapping)

vectorizer = TfidfVectorizer() #max_features=1000

X = vectorizer.fit_transform(df['news'])
y = df['class']

print(X.shape)


(4846, 10070)


In [135]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,
                                    test_size = 0.4,
                                    random_state=1)
X_test.shape

print("Training Data Shape:", X_train.shape)
print("Testing Data Shape:", X_test.shape)

Training Data Shape: (2907, 10070)
Testing Data Shape: (1939, 10070)


In [127]:
#--------------------------------------------------
## ----------- K-NN Classifier ------------------##
#--------------------------------------------------
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

steps = [('scaler', StandardScaler(with_mean=False)),
         ('knn', KNeighborsClassifier(n_neighbors = 3))]

knn_pipeline = Pipeline(steps)
knn_pipeline.fit(X_train, y_train)

ypred_train = knn_pipeline.predict(X_train)
mat_clf_train = confusion_matrix(y_train, ypred_train)
report_clf_train = classification_report(y_train, ypred_train)

print("Confusion Matrix for Training Data:")
print(mat_clf_train)
print("\nClassification Report for Training Data:")
print(report_clf_train)

ypred_test = knn_pipeline.predict(X_test)
mat_clf = confusion_matrix(y_test, ypred_test)
report_clf = classification_report(y_test, ypred_test)

print(mat_clf)
print(report_clf)

Confusion Matrix for Training Data:
[[ 162    4  191]
 [ 120  283 1346]
 [  20    2  779]]

Classification Report for Training Data:
              precision    recall  f1-score   support

          -1       0.54      0.45      0.49       357
           0       0.98      0.16      0.28      1749
           1       0.34      0.97      0.50       801

    accuracy                           0.42      2907
   macro avg       0.62      0.53      0.42      2907
weighted avg       0.75      0.42      0.37      2907

[[  67    2  178]
 [  47   77 1006]
 [  42    4  516]]
              precision    recall  f1-score   support

          -1       0.43      0.27      0.33       247
           0       0.93      0.07      0.13      1130
           1       0.30      0.92      0.46       562

    accuracy                           0.34      1939
   macro avg       0.55      0.42      0.31      1939
weighted avg       0.68      0.34      0.25      1939



In [128]:
#--------------------------------------------------
## ----------- Logistic Regresion ------------------##
#--------------------------------------------------
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

steps = [('scaler', StandardScaler(with_mean=False)),
         ('logReg', LogisticRegression(penalty = "l2", C = 1.0, solver='saga', max_iter=5000))]

LR_pipeline = Pipeline(steps)
LR_pipeline.fit(X_train, y_train)

ypred_train = LR_pipeline.predict(X_train)
mat_clf_train = confusion_matrix(y_train, ypred_train)
report_clf_train = classification_report(y_train, ypred_train)

print("Confusion Matrix for Training Data:")
print(mat_clf_train)
print("\nClassification Report for Training Data:")
print(report_clf_train)

ypred_test = LR_pipeline.predict(X_test)
mat_clf = confusion_matrix(y_test, ypred_test)
report_clf = classification_report(y_test, ypred_test)

print(mat_clf)
print(report_clf)


Confusion Matrix for Training Data:
[[ 357    0    0]
 [   0 1749    0]
 [   0    0  801]]

Classification Report for Training Data:
              precision    recall  f1-score   support

          -1       1.00      1.00      1.00       357
           0       1.00      1.00      1.00      1749
           1       1.00      1.00      1.00       801

    accuracy                           1.00      2907
   macro avg       1.00      1.00      1.00      2907
weighted avg       1.00      1.00      1.00      2907

[[125  81  41]
 [ 33 948 149]
 [ 23 239 300]]
              precision    recall  f1-score   support

          -1       0.69      0.51      0.58       247
           0       0.75      0.84      0.79      1130
           1       0.61      0.53      0.57       562

    accuracy                           0.71      1939
   macro avg       0.68      0.63      0.65      1939
weighted avg       0.70      0.71      0.70      1939



In [129]:
#--------------------------------------------------
## ------------ SVM Classifier ------------------##
#--------------------------------------------------
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

## Linear Kernel  ---------------
steps = [('scaler', StandardScaler(with_mean=False)),
         ('svc', SVC(kernel = 'linear',
                     class_weight='balanced'))]

svcL_pipeline = Pipeline(steps)
svcL_pipeline.fit(X_train, y_train)

ypred_train = svcL_pipeline.predict(X_train)
mat_clf_train = confusion_matrix(y_train, ypred_train)
report_clf_train = classification_report(y_train, ypred_train)

print("Confusion Matrix for Training Data:")
print(mat_clf_train)
print("\nClassification Report for Training Data:")
print(report_clf_train)

ypred_test = svcL_pipeline.predict(X_test)
mat_clf = confusion_matrix(y_test, ypred_test)
report_clf = classification_report(y_test, ypred_test)

print(mat_clf)
print(report_clf)

Confusion Matrix for Training Data:
[[ 357    0    0]
 [   0 1749    0]
 [   0    0  801]]

Classification Report for Training Data:
              precision    recall  f1-score   support

          -1       1.00      1.00      1.00       357
           0       1.00      1.00      1.00      1749
           1       1.00      1.00      1.00       801

    accuracy                           1.00      2907
   macro avg       1.00      1.00      1.00      2907
weighted avg       1.00      1.00      1.00      2907

[[122  75  50]
 [ 43 879 208]
 [ 41 241 280]]
              precision    recall  f1-score   support

          -1       0.59      0.49      0.54       247
           0       0.74      0.78      0.76      1130
           1       0.52      0.50      0.51       562

    accuracy                           0.66      1939
   macro avg       0.62      0.59      0.60      1939
weighted avg       0.65      0.66      0.66      1939



In [130]:
## Polynomial Kernel -----------------------
steps = [('scaler', StandardScaler(with_mean=False)),
         ('svc', SVC(kernel = 'poly', degree = 3,
                     class_weight='balanced'))]

svcPoly_pipeline = Pipeline(steps)
svcPoly_pipeline.fit(X_train, y_train)

ypred_train = svcPoly_pipeline.predict(X_train)
mat_clf_train = confusion_matrix(y_train, ypred_train)
report_clf_train = classification_report(y_train, ypred_train)

print("Confusion Matrix for Training Data:")
print(mat_clf_train)
print("\nClassification Report for Training Data:")
print(report_clf_train)

ypred_test = svcPoly_pipeline.predict(X_test)
mat_clf = confusion_matrix(y_test, ypred_test)
report_clf = classification_report(y_test, ypred_test)

print(mat_clf)
print(report_clf)

Confusion Matrix for Training Data:
[[ 214  143    0]
 [   0 1747    2]
 [   0  269  532]]

Classification Report for Training Data:
              precision    recall  f1-score   support

          -1       1.00      0.60      0.75       357
           0       0.81      1.00      0.89      1749
           1       1.00      0.66      0.80       801

    accuracy                           0.86      2907
   macro avg       0.94      0.75      0.81      2907
weighted avg       0.88      0.86      0.85      2907

[[   5  241    1]
 [   2 1121    7]
 [   4  551    7]]
              precision    recall  f1-score   support

          -1       0.45      0.02      0.04       247
           0       0.59      0.99      0.74      1130
           1       0.47      0.01      0.02       562

    accuracy                           0.58      1939
   macro avg       0.50      0.34      0.27      1939
weighted avg       0.53      0.58      0.44      1939



In [131]:
## RBF Kernel -----------------------
steps = [('scaler', StandardScaler(with_mean=False)),
         ('svc', SVC(kernel = 'rbf', gamma = 'scale',
                     class_weight='balanced'))]

svcRBF_pipeline = Pipeline(steps)
svcRBF_pipeline.fit(X_train, y_train)

ypred_train = svcRBF_pipeline.predict(X_train)
mat_clf_train = confusion_matrix(y_train, ypred_train)
report_clf_train = classification_report(y_train, ypred_train)

print("Confusion Matrix for Training Data:")
print(mat_clf_train)
print("\nClassification Report for Training Data:")
print(report_clf_train)

ypred_test = svcRBF_pipeline.predict(X_test)
mat_clf = confusion_matrix(y_test, ypred_test)
report_clf = classification_report(y_test, ypred_test)

print(mat_clf)
print(report_clf)

Confusion Matrix for Training Data:
[[ 357    0    0]
 [  19 1714   16]
 [  14    3  784]]

Classification Report for Training Data:
              precision    recall  f1-score   support

          -1       0.92      1.00      0.96       357
           0       1.00      0.98      0.99      1749
           1       0.98      0.98      0.98       801

    accuracy                           0.98      2907
   macro avg       0.96      0.99      0.97      2907
weighted avg       0.98      0.98      0.98      2907

[[ 119  104   24]
 [  22 1046   62]
 [  34  308  220]]
              precision    recall  f1-score   support

          -1       0.68      0.48      0.56       247
           0       0.72      0.93      0.81      1130
           1       0.72      0.39      0.51       562

    accuracy                           0.71      1939
   macro avg       0.71      0.60      0.63      1939
weighted avg       0.71      0.71      0.69      1939



In [132]:
#--------------------------------------------------
## ----------- Naive-Bayes Classifier ------------------##
#--------------------------------------------------
## GaussianNB  ---------------
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train.toarray(), y_train)

ypred_train = gnb.predict(X_train.toarray())
mat_clf_train = confusion_matrix(y_train, ypred_train)
report_clf_train = classification_report(y_train, ypred_train)

print("Confusion Matrix for Training Data:")
print(mat_clf_train)
print("\nClassification Report for Training Data:")
print(report_clf_train)

ypred_test = gnb.predict(X_test.toarray())
mat_clf = confusion_matrix(y_test, ypred_test)
report_clf = classification_report(y_test, ypred_test)

print(mat_clf)
print(report_clf)

Confusion Matrix for Training Data:
[[ 357    0    0]
 [  48 1575  126]
 [  58    0  743]]

Classification Report for Training Data:
              precision    recall  f1-score   support

          -1       0.77      1.00      0.87       357
           0       1.00      0.90      0.95      1749
           1       0.86      0.93      0.89       801

    accuracy                           0.92      2907
   macro avg       0.88      0.94      0.90      2907
weighted avg       0.93      0.92      0.92      2907

[[100  89  58]
 [ 77 787 266]
 [ 67 260 235]]
              precision    recall  f1-score   support

          -1       0.41      0.40      0.41       247
           0       0.69      0.70      0.69      1130
           1       0.42      0.42      0.42       562

    accuracy                           0.58      1939
   macro avg       0.51      0.51      0.51      1939
weighted avg       0.58      0.58      0.58      1939



In [133]:
## MultinomialNB  ---------------
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB(alpha=1.0)
mnb.fit(X_train.toarray(), y_train)

ypred_train = mnb.predict(X_train.toarray())
mat_clf_train = confusion_matrix(y_train, ypred_train)
report_clf_train = classification_report(y_train, ypred_train)

print("Confusion Matrix for Training Data:")
print(mat_clf_train)
print("\nClassification Report for Training Data:")
print(report_clf_train)

ypred_test = mnb.predict(X_test.toarray())
mat_clf = confusion_matrix(y_test, ypred_test)
report_clf = classification_report(y_test, ypred_test)

print(mat_clf)
print(report_clf)

Confusion Matrix for Training Data:
[[  14  255   88]
 [   0 1734   15]
 [   0  519  282]]

Classification Report for Training Data:
              precision    recall  f1-score   support

          -1       1.00      0.04      0.08       357
           0       0.69      0.99      0.81      1749
           1       0.73      0.35      0.48       801

    accuracy                           0.70      2907
   macro avg       0.81      0.46      0.46      2907
weighted avg       0.74      0.70      0.63      2907

[[   4  178   65]
 [   0 1119   11]
 [   1  424  137]]
              precision    recall  f1-score   support

          -1       0.80      0.02      0.03       247
           0       0.65      0.99      0.78      1130
           1       0.64      0.24      0.35       562

    accuracy                           0.65      1939
   macro avg       0.70      0.42      0.39      1939
weighted avg       0.67      0.65      0.56      1939



In [134]:
## BernoulliNB  ---------------
from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB(alpha=1.0)
bnb.fit(X_train, y_train)

ypred_train = bnb.predict(X_train.toarray())
mat_clf_train = confusion_matrix(y_train, ypred_train)
report_clf_train = classification_report(y_train, ypred_train)

print("Confusion Matrix for Training Data:")
print(mat_clf_train)
print("\nClassification Report for Training Data:")
print(report_clf_train)

ypred_test = bnb.predict(X_test.toarray())
mat_clf = confusion_matrix(y_test, ypred_test)
report_clf = classification_report(y_test, ypred_test)

print(mat_clf)
print(report_clf)

Confusion Matrix for Training Data:
[[  52  189  116]
 [   2 1729   18]
 [   0  333  468]]

Classification Report for Training Data:
              precision    recall  f1-score   support

          -1       0.96      0.15      0.25       357
           0       0.77      0.99      0.86      1749
           1       0.78      0.58      0.67       801

    accuracy                           0.77      2907
   macro avg       0.84      0.57      0.59      2907
weighted avg       0.79      0.77      0.74      2907

[[   9  133  105]
 [   2 1068   60]
 [   0  340  222]]
              precision    recall  f1-score   support

          -1       0.82      0.04      0.07       247
           0       0.69      0.95      0.80      1130
           1       0.57      0.40      0.47       562

    accuracy                           0.67      1939
   macro avg       0.69      0.46      0.45      1939
weighted avg       0.67      0.67      0.61      1939

