In [134]:
import gensim
import numpy as np
import pandas as pd
import math
import gensim.downloader as api

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import wordnet

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

### Loading Dataset

In [135]:
# Load the BBC News training set; later cells read its "Text" and "Category" columns.
df_train = pd.read_csv("BBC News Train.csv")


### Mapping

In [136]:
# Distinct category labels, sorted ascending.
Category_class = sorted(df_train["Category"].unique().tolist())
Category_class

['business', 'entertainment', 'politics', 'sport', 'tech']

In [137]:
# Fixed label -> integer id table for the five categories.
mapping = {
    'business': 0,
    'entertainment': 1,
    'politics': 2,
    'sport': 3,
    'tech': 4,
}
# A label that is missing from the table maps to NaN.
df_train['CategoryId'] = df_train['Category'].map(mapping)


In [138]:
# Number of articles per category (counts non-null CategoryId values).
df_train.groupby('Category')['CategoryId'].count()

Category
business         336
entertainment    273
politics         274
sport            346
tech             261
Name: CategoryId, dtype: int64

### Preprocess Text

In [139]:
def ProcessText(text):
    """Turn raw text into a list of cleaned, lemmatized tokens.

    Pipeline:
      1. coerce to ``str`` and lowercase;
      2. tokenize with NLTK ``word_tokenize``;
      3. drop English stop words;
      4. drop every token that contains an ASCII digit (so ``100m`` / ``2m``
         go away, not only pure numbers), every token that is not purely
         alphanumeric (punctuation, hyphenated words, ...) and every token
         made only of digit characters;
      5. lemmatize each remaining token as a verb with WordNet.

    Parameters
    ----------
    text : object
        Input text; anything is accepted because it is passed through ``str()``.

    Returns
    -------
    list of str
        The lemmatized tokens, in their original order.
    """
    text = str(text).lower()

    stop_words = set(stopwords.words('english'))
    ascii_digits = set('0123456789')

    # Single pass over the tokens. The previous version collected the
    # digit-bearing tokens and then called list.remove() once per hit, which is
    # quadratic in the document length and shadowed the builtin `filter`.
    filtered_list = [
        w for w in word_tokenize(text)
        if w not in stop_words
        and ascii_digits.isdisjoint(w)   # no ASCII digit anywhere in the token
        and w.isalnum()                  # letters/digits only: drops punctuation
        and not w.isdigit()              # drops tokens of non-ASCII digit characters
    ]

    # Lemmatizing (every token is treated as a verb)
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmatized_list = [
        wordnet_lemmatizer.lemmatize(w, wordnet.VERB) for w in filtered_list
    ]

    return lemmatized_list

### Apply the text preprocessing to the Text column

In [140]:
# Two tokenizations of the raw article text, stored side by side:
#   Text_Processed -> ProcessText output
#   text_clean     -> gensim.utils.simple_preprocess output
df_train['Text_Processed'] = df_train['Text'].apply(ProcessText)
df_train['text_clean'] = df_train['Text'].apply(gensim.utils.simple_preprocess)


### Word2Vec Model

In [141]:
from gensim.models.keyedvectors import KeyedVectors as wv

In [142]:
# Load the 'word2vec-google-news-300' vectors through gensim's downloader API.
# NOTE: this rebinds `wv`, which the import in the previous cell had bound to
# the KeyedVectors class; from here on `wv` is the loaded vector model.
wv = api.load('word2vec-google-news-300')



In [143]:
# Hold out 20% of the articles for evaluation.
# The seed is fixed so that Restart & Run All reproduces the same split, and
# therefore the same metrics in every cell below.
X_train, X_test, y_train, y_test = train_test_split(
    df_train['Text_Processed'],
    df_train['Category'],
    test_size=0.2,
    random_state=42,
)

In [144]:
# Train a Word2Vec model on the tokenized training articles.
# NOTE(review): `w2v` is not referenced by the other cells visible here, which
# embed documents with the pretrained `wv` vectors - confirm it is still needed.
w2v = gensim.models.Word2Vec(
    X_train,
    vector_size=100,
    window=5,
    min_count=2,
)


In [145]:
# Vocabulary of the pretrained model, as a set for membership tests.
words = set(wv.index_to_key)


def _lookup_vectors(token_lists):
    """Map each token list to an array of the `wv` vectors of its in-vocabulary tokens.

    Tokens absent from `words` are skipped. The per-document arrays are
    collected into one object-dtype array.
    """
    per_document = [
        np.array([wv[token] for token in tokens if token in words])
        for tokens in token_lists
    ]
    return np.array(per_document, dtype=object)


X_train_vect = _lookup_vectors(X_train)
X_test_vect = _lookup_vectors(X_test)

In [146]:
# For the first 12 training documents, print the token count next to the
# number of tokens that were found in the embedding vocabulary.
for i, v in enumerate(X_train_vect[:12]):
    print(len(X_train.iloc[i]), len(v))

75 67
183 178
141 132
241 237
97 96
241 233
164 156
113 108
337 321
169 156
218 195
138 138


### Compute sentence vectors by averaging the word vectors of the words contained in each sentence

In [147]:

def _mean_vectors(doc_vectors, dim=300):
    """Average the word vectors of each document.

    A document with no word vectors (empty array) is represented by a zero
    vector of length ``dim``.
    """
    return [
        vectors.mean(axis=0) if vectors.size else np.zeros(dim, dtype=float)
        for vectors in doc_vectors
    ]


X_train_vect_avg = _mean_vectors(X_train_vect)
X_test_vect_avg = _mean_vectors(X_test_vect)

### Are our sentence vector lengths consistent?

In [148]:
# For the first 12 training documents, print the token count next to the
# length of the averaged sentence vector.
for i, v in enumerate(X_train_vect_avg[:12]):
    print(len(X_train.iloc[i]), len(v))

75 300
183 300
141 300
241 300
97 300
241 300
164 300
113 300
337 300
169 300
218 300
138 300


#### Hyperparameter Tuning

In [149]:
# One default-configured instance per candidate classifier.
svc = SVC()
knn = KNeighborsClassifier()
dtc = DecisionTreeClassifier()
rtc = RandomForestClassifier()
# `mb` is created here but is not part of the `models` dict searched below.
mb = MultinomialNB()

In [150]:
# Name -> estimator table; the names are also the keys used in `params`.
models = dict(svc=svc, knn=knn, dtc=dtc, rtc=rtc)


In [151]:
# Hyperparameter grids, keyed by the same short names used for the estimators.
params = {
    'knn': {
        'n_neighbors': [3, 5, 7, 9, 11, 13, 15],
        'metric': ['cosine', 'euclidean', 'manhattan'],
        'weights': ['uniform', 'distance'],
    },

    'svc': {
        'C': [0.1, 1, 10, 100],
        # Was [1, 0.1, 0.01, 0.01]: the repeated 0.01 refit identical
        # configurations; the decade sequence continues with 0.001.
        # gamma only affects the 'rbf' kernel here, not 'linear'.
        'gamma': [1, 0.1, 0.01, 0.001],
        'kernel': ['rbf', 'linear'],
    },

    'dtc': {
        'criterion': ['gini', 'entropy'],
        'max_depth': [2, 4, 6, 8, 10, 12],
    },

    'mb': {
        'alpha': [1.0, 2.0],
        'fit_prior': [True],
    },

    'rtc': {
        'criterion': ['gini', 'entropy'],
        'max_depth': [2, 4, 6, 8, 10, 12],
    },
}

In [152]:
# Grid-search every candidate with 5-fold cross-validation on the training
# vectors, then report each tuned model's accuracy on the held-out test set.
model_accuracy = {}   # model name -> test-set accuracy of its tuned estimator
score = 0.0001        # best mean cross-validated score seen so far
best_model = None     # fitted GridSearchCV with the highest CV score
for model in models.keys():
    mod = GridSearchCV(
        models[model],
        params[model],
        verbose=0,   # no progress output
        cv=5,        # 5-fold cross-validation
        n_jobs=-1,   # use all available cores
    )
    # fit() returns the fitted search object itself
    gridsearch_result = mod.fit(X_train_vect_avg, y_train.values.ravel())

    # Select the best model on the cross-validated score, not on the test set:
    # picking the winner by test accuracy leaks the test data into model
    # selection and makes the reported test accuracy optimistic.
    cv_score = float(gridsearch_result.best_score_)
    if score < cv_score:
        score = cv_score
        best_model = gridsearch_result

    # Test-set predictions are computed once and used for reporting only.
    predict = gridsearch_result.predict(X_test_vect_avg)
    print(f"{model} : ",gridsearch_result.best_estimator_)
    model_accuracy[model] = accuracy_score(y_test, predict)

svc :  SVC(C=10, gamma=1, kernel='linear')
knn :  KNeighborsClassifier(metric='cosine', weights='distance')
dtc :  DecisionTreeClassifier(criterion='entropy', max_depth=12)
rtc :  RandomForestClassifier(max_depth=10)


In [153]:
# Display the test-set accuracy recorded for each tuned model.
model_accuracy

{'svc': 0.9731543624161074,
 'knn': 0.9530201342281879,
 'dtc': 0.8120805369127517,
 'rtc': 0.9496644295302014}

In [154]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier,GradientBoostingClassifier

# Despite the `rf` name, the estimator fitted here is a gradient-boosting
# classifier. AdaBoostClassifier and RandomForestClassifier were the
# alternatives left commented out in the original cell.
rf = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=.5,
    max_depth=1,
    random_state=0,
)

# fit() returns the fitted estimator, so `rf_model` and `rf` are the same object.
rf_model = rf.fit(X_train_vect_avg, y_train.values.ravel())

In [155]:
# Predict a category label for every averaged test vector and display them.
y_pred = rf_model.predict(X_test_vect_avg)
y_pred


array(['tech', 'politics', 'tech', 'sport', 'tech', 'sport', 'sport',
       'entertainment', 'politics', 'sport', 'tech', 'tech',
       'entertainment', 'tech', 'tech', 'business', 'tech', 'tech',
       'sport', 'politics', 'tech', 'business', 'entertainment',
       'business', 'tech', 'business', 'entertainment', 'sport',
       'politics', 'sport', 'politics', 'politics', 'politics', 'sport',
       'entertainment', 'tech', 'politics', 'entertainment', 'tech',
       'entertainment', 'tech', 'entertainment', 'tech', 'politics',
       'business', 'politics', 'business', 'business', 'tech', 'sport',
       'sport', 'sport', 'sport', 'politics', 'entertainment',
       'entertainment', 'business', 'sport', 'entertainment',
       'entertainment', 'entertainment', 'sport', 'business',
       'entertainment', 'business', 'tech', 'business', 'business',
       'entertainment', 'business', 'business', 'tech', 'politics',
       'business', 'tech', 'sport', 'business', 'tech', 'tech', '

In [156]:
from sklearn.metrics import precision_score, recall_score

# Precision and recall are averaged over the categories with
# average="weighted"; accuracy is the fraction of exact label matches.
precision = precision_score(y_test, y_pred, average="weighted")
recall = recall_score(y_test, y_pred, average="weighted")
print(
    'Precision: {} / Recall: {} / Accuracy: {}'.format(
        round(precision, 3),
        round(recall, 3),
        round((y_pred == y_test).sum() / len(y_pred), 3),
    )
)

Precision: 0.947 / Recall: 0.946 / Accuracy: 0.946


In [172]:
text = "A secretariat meeting of the party held on Monday morning decided to quit the Pushpa Kamal Dahal-led government and also withdraw its support to the government. The decision by the party comes in the wake of a changed political equation in the run-up to the presidential election. UML had earlier decided to wait till the presidential election scheduled for March 9 before taking a final call on whether to leave the government. Bishnu Paudel, UML’s vice-chairman, said the meeting decided to pull out of the government after Prime Minister Pushpa Kamal Dahal ‘started working in a different fashion’. Prime minister directed the foreign minister to cancel her foreign trip. It did not seem appropriate for us to stay in the government after he asked to leave the government voluntarily or work as a minister without portfolio,” said Paudel talking to journalists after the meeting.The party, riled by the prime minister’s direction to Minister for Foreign Affairs Bimala Rai Paudyal to cancel her Geneva trip, had summoned the secretariat meeting. "

# Same steps as for the training data: tokenize, look up the pretrained
# vectors of the in-vocabulary tokens, then average them.
text = ProcessText(text)
text_vect = np.array([np.array([wv[token] for token in text if token in words])])

# A text without any in-vocabulary token falls back to a 300-d zero vector.
text_vect_avg = [
    v.mean(axis=0) if v.size else np.zeros(300, dtype=float)
    for v in text_vect
]
rf_model.predict(text_vect_avg)


array(['politics'], dtype=object)

In [173]:
text = "Nepal has been confirmed as the host nation of the first round of the Women's Olympic Football Asian Qualifiers for the Paris Olympics in 2024, ANFA said on its website on Sunday. Nepal have been drawn alongside Vietnam, Palestine and Afghanistan in Group C. Palestine will not travel to Nepal after they withdrew from the tournament. “AFC confirmed Nepal as the host nation for Group C,” ANFA added. The first round matches will be held from April 3 and 11. Seven group winners from the first round will join North Korea, Australia, China, Japan and South Korea—who received bye as the five-highest ranked teams—in the second round. The 12 teams will be drawn into three groups in the second round, with the winner of each group and one best runner-up advancing to the third round. The top two teams from the third round will secure their berths in the Paris Olympics 2024."

# Same steps as for the training data: tokenize, look up the pretrained
# vectors of the in-vocabulary tokens, then average them.
text = ProcessText(text)
text_vect = np.array([np.array([wv[token] for token in text if token in words])])

# A text without any in-vocabulary token falls back to a 300-d zero vector.
text_vect_avg = [
    v.mean(axis=0) if v.size else np.zeros(300, dtype=float)
    for v in text_vect
]
rf_model.predict(text_vect_avg)


array(['sport'], dtype=object)