In [None]:
import os
import string
import numpy as np
import pandas as pd
import re
import nltk
import spacy
import pickle
from spacy import displacy
from sklearn.model_selection import train_test_split
from mlxtend.plotting import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import sklearn.metrics as metrics
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA, LEMMA
from spacy.tokens import Doc
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Load spaCy's large English pipeline once; the cells below reuse `nlp` for the
# displaCy renderings and for processing every document.
nlp = spacy.load('en_core_web_lg')

### Read Data

In [None]:
# Kaggle input directory of the dataset. Every entry listed under it is later
# passed to read_data, which uses the entry name as the label.
root_folder = r'/kaggle/input/nlp-project-fcis-23/20news-19997/20_newsgroups'
sub_folders = os.listdir(root_folder)
len(sub_folders)  # shown as the cell output

In [None]:
# Display the folder names that will serve as class labels.
sub_folders

In [None]:
def read_data(root_folder, sub_folders):
    """Read every file found under each label folder into memory.

    Args:
        root_folder: Directory that contains one sub-directory per label.
        sub_folders: Names of the sub-directories to read; each name is used
            verbatim as the label of the files inside it.

    Returns:
        A ``(data, labels)`` tuple of two equally long lists: the text of each
        file (decoded as ISO-8859-1) and the name of the folder it came from.
    """
    data, labels = [], []

    for folder in sub_folders:
        path = os.path.join(root_folder, folder)

        for sample in os.listdir(path):
            sample_path = os.path.join(path, sample)
            # The context manager closes each handle immediately; a bare
            # open() left every file open until garbage collection.
            with open(sample_path, encoding='iso-8859-1') as handle:
                data.append(handle.read())
            labels.append(folder)

    return data, labels

In [None]:
# Load the whole corpus. read_data appends to both lists together, so the two
# printed lengths are expected to match.
data, labels = read_data(root_folder, sub_folders)

print(len(data))
print(len(labels))

In [None]:
# Inspect one sample (index 1200): its label, a separator line, then its text.
print(labels[1200])
print('-' * 50 , end='\n\n')
print(data[1200])

### Some Visualization Methods

In [None]:
# Word cloud for sample 1200, preceded by its label.
print(labels[1200])

# Create a WordCloud object (800x800, white background) from the sample text
wordcloud = WordCloud(width=800, height=800, background_color='white').generate(data[1200])

# Display the word cloud without axes
plt.figure(figsize=(8,8))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

In [6]:
# Named-entity visualisation of sample 1200 via displaCy, preceded by its label.
print(labels[1200])
print('-' * 50 , end='\n\n')
displacy.render(nlp(data[1200]), style = "ent",jupyter = True)

talk.politics.mideast
--------------------------------------------------



In [7]:
 displacy.render(nlp(data[1200]), style = "dep",jupyter = True, options={'distance': 80,})
# displacy.render(nlp(data[1200]), style="dep", options={"compact": True, "bg": "#09a3d5",
#                                            'distance': 80,"color": "white",
#                                            "font": "Source Sans Pro"})

### Clean Header

In [None]:
def clean_header(text):
    """Remove header-style fields from a raw post.

    Deletes every ``From:``, ``Subject:``, ``Archive-name:``/``archive-name:``
    (together with any run of letters, digits, hyphens or whitespace directly
    in front of it), ``Last-modified:`` and ``Version:`` field, up to and
    including the newline that ends it. The patterns are not anchored, so a
    match anywhere in the text is removed.

    Args:
        text: Raw message text.

    Returns:
        The text with all matching fields removed.
    """
    header_patterns = (
        r'(From:\s+[^\n]+\n)',
        r'(Subject:[^\n]+\n)',
        # [Aa] rather than [A|a]: inside a character class '|' is a literal
        # pipe, so the old pattern also matched "|rchive-name:".
        r'(([\sA-Za-z0-9\-]+)?[Aa]rchive-name:[^\n]+\n)',
        r'(Last-modified:[^\n]+\n)',
        r'(Version:[^\n]+\n)',
    )
    for pattern in header_patterns:
        text = re.sub(pattern, '', text)

    return text

### Clean Text

In [None]:
# URLs with an http, https or ftp scheme.
re_url = re.compile(r'(?:http|ftp|https)://(?:[\w_-]+(?:(?:\.[\w_-]+)+))(?:[\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?')
# E-mail addresses (lower-case only; clean_text lower-cases before matching).
# This must be a raw string: in a normal literal Python itself expands \x5d to
# a literal ']' (closing the character class early) and '\\[' to an escaped
# literal '[', which corrupted the quoted-local-part branch of the pattern.
re_email = re.compile(r'(?:[a-z0-9!#$%&\'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&\'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])')

def clean_text(text):
    """Normalise a document body for tokenisation.

    Steps, in order: lower-case; delete URLs and e-mail addresses; delete
    every ``string.punctuation`` character; replace each run of digits with a
    space; collapse whitespace runs to a single space; strip the ends.

    Args:
        text: Text to normalise.

    Returns:
        The normalised text, without leading or trailing whitespace.
    """
    text = text.lower()
    text = re_url.sub('', text)
    text = re_email.sub('', text)
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
    text = re.sub(r'(\d+)', ' ', text)
    text = re.sub(r'(\s+)', ' ', text)

    # Strip last: the substitutions above can leave a blank at either end,
    # which an up-front strip() (as before) could not remove.
    return text.strip()

In [None]:
# Clean every document in place: header fields are removed first, then the
# remaining text is normalised.
for i in range(len(data)):
    data[i] = clean_text(clean_header(data[i]))

### Tokenization & Stop-word Removal
- Stop-word tokens can be removed from a `Doc` object by creating a new `Doc` object without the unwanted tokens.

- To do so, convert the `Doc` to a NumPy array, delete the unwanted rows from that array, and then load the array back into a new `Doc`.

In [None]:
def remove_stopwords(doc):
    """Return a new ``Doc`` containing only the non-stop-word tokens of ``doc``.

    The LOWER, POS, ENT_TYPE, IS_ALPHA and LEMMA attributes of ``doc`` are
    exported to a numpy array, the rows belonging to stop-word tokens are
    deleted, and the remaining rows are loaded into a fresh ``Doc`` built
    from the kept token texts and ``doc.vocab``.

    Args:
        doc: A spaCy ``Doc``.

    Returns:
        The filtered ``Doc``.
    """
    attrs = [LOWER, POS, ENT_TYPE, IS_ALPHA, LEMMA]

    stop_indexes = [index for index, token in enumerate(doc) if token.is_stop]
    # Filter on the token flag directly instead of testing `i not in indexes`
    # against a list, which made the old comprehension quadratic on long docs.
    kept_words = [token.text for token in doc if not token.is_stop]

    kept_rows = np.delete(doc.to_array(attrs), stop_indexes, axis=0)
    doc2 = Doc(doc.vocab, words=kept_words)
    doc2.from_array(attrs, kept_rows)
    return doc2

In [None]:
# Run the spaCy pipeline on every cleaned document and drop its stop words.
# The index is printed every 1000 documents as a progress indicator.
Docs = []
for i in range(len(data)):
    Docs.append(remove_stopwords(nlp(data[i])))
    if i%1000 == 0:
        print(i)

### Lemmatization

In [None]:
# Text of the first processed document (after stop-word removal).
str(Docs[0])

In [None]:
# Preview: lemmas of the first document joined into one space-separated string.
lemmas_ = ' '.join(token.lemma_ for token in Docs[0])
lemmas_

In [None]:
# Build the model input: one space-joined lemma string per document.
# The index is printed every 1000 documents as a progress indicator.
final_data = []
for i in range(len(Docs)):
    lemmas_ = ' '.join(token.lemma_ for token in Docs[i])
    final_data.append(lemmas_)
    if i%1000 == 0:
        print(i)

### TF-IDF

In [None]:
# Fit TF-IDF on the lemmatised corpus, capped at 5000 features.
vectorizer = TfidfVectorizer(analyzer='word', max_features=5000)
vectors = vectorizer.fit_transform(final_data)

# Persist the fitted vectorizer alongside the models. Any later prediction on
# new text must call `transform` on this same object so that its feature
# columns line up with the ones the models were trained on.
with open('TfidfVectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [None]:
# (number of documents, number of TF-IDF features)
vectors.shape

In [None]:
# Stored values of the matrix returned by fit_transform.
# NOTE(review): this displays the whole array; consider a slice instead.
vectors.data

### Split Data

In [None]:
# Convert the folder-name labels into integer class ids.
le = LabelEncoder()
encoded_labels = le.fit_transform(labels)

In [None]:
# 70/30 stratified split. random_state pins the shuffle so the split, and every
# score computed from it, is reproducible across kernel restarts. Stratifying
# on encoded_labels uses the same array that is being split.
X_train, X_val, y_train, y_val = train_test_split(
    vectors,
    encoded_labels,
    test_size=0.3,
    stratify=encoded_labels,
    shuffle=True,
    random_state=42,
)

In [None]:
# Sanity-check the shapes of the four arrays produced by the split.
print(X_train.shape)
print(X_val.shape)
print(y_train.shape)
print(y_val.shape)

## Name of labels

In [None]:
# Class names handed to plot_confusion_matrix as `class_names`, one per line,
# in alphabetical order.
names = [
    'alt.atheism',
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
    'comp.windows.x',
    'misc.forsale',
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey',
    'sci.crypt',
    'sci.electronics',
    'sci.med',
    'sci.space',
    'soc.religion.christian',
    'talk.politics.guns',
    'talk.politics.mideast',
    'talk.politics.misc',
    'talk.religion.misc',
]

# RandomForest ---> Best Till Now

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 200 trees with a fixed seed, trained on the training split.
rf = RandomForestClassifier(n_estimators=200, random_state=42,)
rf.fit(X_train, y_train)

In [None]:
# Predict the validation split and eyeball the first 24 predictions against
# the true labels.
y_pred = rf.predict(X_val)
print(y_pred[:24])
print(y_val[:24])

In [None]:
# Confusion matrix of validation labels vs. predictions, plotted with class names.
cm = confusion_matrix(y_val,y_pred)
plot_confusion_matrix(cm,class_names=names,figsize=(12,5))

In [None]:
# Report rf.score on both splits as percentages. The figure printed as "Test"
# is computed on the validation split (X_val, y_val).
print(f"Train Accuracy: {rf.score(X_train, y_train)*100:.3f}%")
print(f"Test Accuracy: {rf.score(X_val, y_val)*100:.3f}%")

In [None]:
# Persist the trained forest. The with-block closes the file as soon as the
# dump finishes, instead of leaving the handle from a bare open() dangling.
with open('RandomForestModel.pkl', 'wb') as model_file:
    pickle.dump(rf, model_file)

# DecisionTreeClassifier 

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Single decision tree with default hyper-parameters (no random_state is set).
dt=DecisionTreeClassifier()

In [None]:
# Train the tree on the training split.
dt.fit(X_train,y_train)

In [None]:
# Score on the training split (printed further down as "Train Accuracy").
train_score_dt = dt.score(X_train,y_train)

In [None]:
# Score on the validation split (printed further down as "Test Accuracy").
test_score_dt = dt.score(X_val,y_val)

In [None]:
# Validation predictions, used for the confusion matrix.
ytest_pred_dt = dt.predict(X_val)

In [None]:
# Confusion matrix of validation labels vs. tree predictions, plotted with class names.
conf_mat_dt =confusion_matrix(y_val,ytest_pred_dt)
plot_confusion_matrix(conf_mat_dt,class_names=names,figsize=(12,5))

In [None]:
# Report both scores as percentages; "Test" is the validation-split score.
print(f"Train Accuracy: {train_score_dt*100:.3f}%")
print(f"Test Accuracy: {test_score_dt*100:.3f}%")

# XGB Classifier

In [None]:
from xgboost import XGBClassifier

# The task has 20 classes, so request the multi-class probability objective.
# The previous value 'binary:RandomForest' is not an objective XGBoost defines.
xgb = XGBClassifier(objective='multi:softprob', learning_rate=0.04, n_estimators=200)

In [None]:
# Train the boosted model on the training split.
xgb.fit(X_train,y_train)

In [None]:
# Score on the training split (printed further down as "Train Accuracy").
train_score_xgb = xgb.score(X_train,y_train)

In [None]:
# Score on the validation split (printed further down as "Test Accuracy").
test_score_xgb = xgb.score(X_val,y_val)

In [None]:
# Validation predictions, used for the confusion matrix.
ytest_pred_xgb = xgb.predict(X_val)

In [None]:
# Confusion matrix of validation labels vs. XGBoost predictions, plotted with class names.
conf_mat_xgb =confusion_matrix(y_val,ytest_pred_xgb)
plot_confusion_matrix(conf_mat_xgb,class_names=names,figsize=(12,5))

In [None]:
# Report both scores as percentages; "Test" is the validation-split score.
print(f"Train Accuracy: {train_score_xgb*100:.3f}%")
print(f"Test Accuracy: {test_score_xgb*100:.3f}%")

In [None]:
# Persist the trained XGBoost model. The with-block closes the file as soon as
# the dump finishes, instead of leaving the handle from a bare open() dangling.
with open('XGBClassifierModel.pkl', 'wb') as model_file:
    pickle.dump(xgb, model_file)

# AdaBoost Classifier 

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with n_estimators=100 whose base estimator is a default
# RandomForestClassifier (imported in the RandomForest section above).
# NOTE(review): boosting whole forests is likely very slow to train — confirm
# this is intended rather than a shallow tree as base estimator.
abc = AdaBoostClassifier(learning_rate=0.02, n_estimators=100,estimator=RandomForestClassifier())

In [None]:
# Train the AdaBoost ensemble on the training split.
abc.fit(X_train,y_train)

In [None]:
# Score on the training split (printed further down as "Train Accuracy").
train_score_abc = abc.score(X_train,y_train)

In [None]:
# Score on the validation split (printed further down as "Test Accuracy").
test_score_abc = abc.score(X_val,y_val)

In [None]:
# Validation predictions, used for the confusion matrix.
ytest_pred_abc = abc.predict(X_val)

In [None]:
# Confusion matrix of validation labels vs. AdaBoost predictions, plotted with class names.
conf_mat_abc =confusion_matrix(y_val,ytest_pred_abc)
plot_confusion_matrix(conf_mat_abc,class_names=names,figsize=(12,5))

In [None]:
# Report both scores as percentages; "Test" is the validation-split score.
print(f"Train Accuracy: {train_score_abc*100:.3f}%")
print(f"Test Accuracy: {test_score_abc*100:.3f}%")

# KNN 

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# k-nearest-neighbours classifier with default hyper-parameters.
knn = KNeighborsClassifier()

In [None]:
# Fit the classifier on the training split.
knn.fit(X_train, y_train)

In [None]:
# Predictions on the training split (not referenced by the cells shown below).
ytrain_pred_knn = knn.predict(X_train)

In [None]:
# Validation predictions, used for the confusion matrix.
ytest_pred_knn = knn.predict(X_val)

In [None]:
# Score on the training split (printed further down as "Train Accuracy").
ytrain_score_knn = knn.score(X_train,y_train)


In [None]:
# Score on the validation split (printed further down as "Test Accuracy").
ytest_score_knn = knn.score(X_val,y_val)


In [None]:
# Confusion matrix of validation labels vs. KNN predictions.
conf_mat_knn =confusion_matrix(y_val,ytest_pred_knn)

In [None]:
# Plot the KNN confusion matrix with class names on the axes.
plot_confusion_matrix(conf_mat_knn,class_names=names,figsize=(12,5))

In [None]:
# Report both scores as percentages; "Test" is the validation-split score.
print(f"Train Accuracy: {ytrain_score_knn*100:.3f}%")
print(f"Test Accuracy: {ytest_score_knn*100:.3f}%")

# test script

In [14]:
import os
import string
import numpy as np
import pandas as pd
import re
import nltk
import spacy
from spacy import displacy
from sklearn.model_selection import train_test_split
from mlxtend.plotting import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA, LEMMA
from spacy.tokens import Doc
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
import sklearn.metrics as metrics
import pickle
# Load spaCy's large English pipeline; it is used below to process every test document.
nlp = spacy.load('en_core_web_lg')

In [9]:
# Kaggle input directory of the test data. Every entry listed under it is
# passed to read_data, which uses the entry name as the label.
root_folder = r'/kaggle/input/nlp-project-fcis-23/20news-bydate/20news-bydate-test'
sub_folders = os.listdir(root_folder)
len(sub_folders)  # shown as the cell output

20

In [3]:
def read_data(root_folder, sub_folders):
    """Read every file found under each label folder into memory.

    Args:
        root_folder: Directory that contains one sub-directory per label.
        sub_folders: Names of the sub-directories to read; each name is used
            verbatim as the label of the files inside it.

    Returns:
        A ``(data, labels)`` tuple of two equally long lists: the text of each
        file (decoded as ISO-8859-1) and the name of the folder it came from.
    """
    data, labels = [], []

    for folder in sub_folders:
        path = os.path.join(root_folder, folder)

        for sample in os.listdir(path):
            sample_path = os.path.join(path, sample)
            # The context manager closes each handle immediately; a bare
            # open() left every file open until garbage collection.
            with open(sample_path, encoding='iso-8859-1') as handle:
                data.append(handle.read())
            labels.append(folder)

    return data, labels
def remove_stopwords(doc):
    """Return a new ``Doc`` containing only the non-stop-word tokens of ``doc``.

    The LOWER, POS, ENT_TYPE, IS_ALPHA and LEMMA attributes of ``doc`` are
    exported to a numpy array, the rows belonging to stop-word tokens are
    deleted, and the remaining rows are loaded into a fresh ``Doc`` built
    from the kept token texts and ``doc.vocab``.

    Args:
        doc: A spaCy ``Doc``.

    Returns:
        The filtered ``Doc``.
    """
    attrs = [LOWER, POS, ENT_TYPE, IS_ALPHA, LEMMA]

    stop_indexes = [index for index, token in enumerate(doc) if token.is_stop]
    # Filter on the token flag directly instead of testing `i not in indexes`
    # against a list, which made the old comprehension quadratic on long docs.
    kept_words = [token.text for token in doc if not token.is_stop]

    kept_rows = np.delete(doc.to_array(attrs), stop_indexes, axis=0)
    doc2 = Doc(doc.vocab, words=kept_words)
    doc2.from_array(attrs, kept_rows)
    return doc2
def clean_header(text):
    """Remove header-style fields from a raw post.

    Deletes every ``From:``, ``Subject:``, ``Archive-name:``/``archive-name:``
    (together with any run of letters, digits, hyphens or whitespace directly
    in front of it), ``Last-modified:`` and ``Version:`` field, up to and
    including the newline that ends it. The patterns are not anchored, so a
    match anywhere in the text is removed.

    Args:
        text: Raw message text.

    Returns:
        The text with all matching fields removed.
    """
    header_patterns = (
        r'(From:\s+[^\n]+\n)',
        r'(Subject:[^\n]+\n)',
        # [Aa] rather than [A|a]: inside a character class '|' is a literal
        # pipe, so the old pattern also matched "|rchive-name:".
        r'(([\sA-Za-z0-9\-]+)?[Aa]rchive-name:[^\n]+\n)',
        r'(Last-modified:[^\n]+\n)',
        r'(Version:[^\n]+\n)',
    )
    for pattern in header_patterns:
        text = re.sub(pattern, '', text)

    return text
def clean_text(text):
    """Normalise a document body for tokenisation.

    Steps, in order: lower-case; delete matches of the module-level ``re_url``
    and ``re_email`` patterns; delete every ``string.punctuation`` character;
    replace each run of digits with a space; collapse whitespace runs to a
    single space; strip the ends.

    Args:
        text: Text to normalise.

    Returns:
        The normalised text, without leading or trailing whitespace.
    """
    text = text.lower()
    text = re.sub(re_url, '', text)
    text = re.sub(re_email, '', text)
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
    text = re.sub(r'(\d+)', ' ', text)
    text = re.sub(r'(\s+)', ' ', text)

    # Strip last: the substitutions above can leave a blank at either end,
    # which an up-front strip() (as before) could not remove.
    return text.strip()


In [4]:
data, labels = read_data(root_folder, sub_folders)

# URLs with an http, https or ftp scheme.
re_url = re.compile(r'(?:http|ftp|https)://(?:[\w_-]+(?:(?:\.[\w_-]+)+))(?:[\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?')
# E-mail addresses (lower-case only; clean_text lower-cases before matching).
# This must be a raw string: in a normal literal Python itself expands \x5d to
# a literal ']' (closing the character class early) and '\\[' to an escaped
# literal '[', which corrupted the quoted-local-part branch of the pattern.
re_email = re.compile(r'(?:[a-z0-9!#$%&\'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&\'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])')

# Clean every document in place: header fields first, then the body text.
for i in range(len(data)):
    data[i] = clean_text(clean_header(data[i]))

In [5]:
# Run the spaCy pipeline on every cleaned test document and drop its stop words.
# The index is printed every 1000 documents as a progress indicator.
Docs = []
for i in range(len(data)):
    Docs.append(remove_stopwords(nlp(data[i])))
    if i%1000 == 0:
        print(i)

0
1000
2000
3000
4000
5000
6000
7000


In [6]:
# Build the model input: one space-joined lemma string per test document.
# The index is printed every 1000 documents as a progress indicator.
final_data = []
for i in range(len(Docs)):
    lemmas_ = ' '.join(token.lemma_ for token in Docs[i])
    final_data.append(lemmas_)
    if i%1000 == 0:
        print(i)

0
1000
2000
3000
4000
5000
6000
7000


In [10]:
# A model can only be applied to features produced by the vectorizer it was
# trained with. Fitting a new TfidfVectorizer on the test corpus builds a
# different vocabulary and different IDF weights, so column j here would not
# mean the same term as column j during training.
# NOTE(review): assumed location of the vectorizer pickled at training time —
# point this at wherever that file is actually stored.
VECTORIZER_PATH = 'TfidfVectorizer.pkl'

if os.path.exists(VECTORIZER_PATH):
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(VECTORIZER_PATH, 'rb') as vectorizer_file:
        vectorizer = pickle.load(vectorizer_file)
    vectors = vectorizer.transform(final_data)
else:
    # Previous behaviour, kept so the cell still runs without the saved file.
    print(f"WARNING: {VECTORIZER_PATH} not found; fitting a new TF-IDF vectorizer "
          "on the test data. Its features will not match the trained model.")
    vectorizer = TfidfVectorizer(analyzer='word', max_features=5000)
    vectors = vectorizer.fit_transform(final_data)

In [23]:
# Class names of the 20 newsgroups, one per line, in alphabetical order.
names = [
    'alt.atheism',
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
    'comp.windows.x',
    'misc.forsale',
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey',
    'sci.crypt',
    'sci.electronics',
    'sci.med',
    'sci.space',
    'soc.religion.christian',
    'talk.politics.guns',
    'talk.politics.mideast',
    'talk.politics.misc',
    'talk.religion.misc',
]

In [19]:
# Load the previously trained XGBoost model.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# model files from a source you trust.
with open('/kaggle/input/xgb-nlp-model/XGBClassifierModel.pkl', 'rb') as model_file:
    bestmodel = pickle.load(model_file)

In [20]:
# Predict a class id for every test document (shown as the cell output).
# NOTE(review): presumably these ids index into `names` — confirm against the
# label encoding used at training time.
bestmodel.predict(vectors)

array([16,  7,  6, ..., 12,  6, 12])