In [1]:
import pandas as pd 
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
import pickle
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the training split from the current working directory.
df = pd.read_csv(filepath_or_buffer='train.csv')

I used a third-party Kaggle dataset to train my news classifier model.

In [3]:
# Preview the first five rows of the raw training frame.
df.head(n=5)

Unnamed: 0,Class Index,Title,Description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


The dataset consists of class IDs 1-4, where 1 = World, 2 = Sports, 3 = Business, and 4 = Sci/Tech.

#### The model's aim is classification, so the first step is to merge Title and Description into a single text field

In [4]:
# Merge headline and body into one text field, separated by a single space.
# A vectorised column concatenation gives the same strings as the row-wise
# `apply(axis=1)` lambda without a Python-level call for each row.
df['text'] = df['Title'] + ' ' + df['Description']

In [5]:
# The merged `text` column supersedes the two source columns, so remove them.
df.drop(columns=['Title', 'Description'], inplace=True)
# Expose the label under a shorter name; `Class Index` itself is kept.
df['classes'] = df['Class Index']

In [6]:
# Print the column names, non-null counts, dtypes and memory usage of `df`.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   Class Index  120000 non-null  int64 
 1   text         120000 non-null  object
 2   classes      120000 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 2.7+ MB


In [7]:
# Share of each class label, expressed as a percentage of all rows.
class_share = df['classes'].value_counts(normalize=True)
100 * class_share

3    25.0
4    25.0
2    25.0
1    25.0
Name: classes, dtype: float64

The classes are perfectly balanced: each of the four accounts for exactly 25% of the rows.

## Text Cleaning & Preprocessing


##### I am removing special characters and numbers as well, because the classification relies on count vectorization, so the textual data alone is enough to classify an article


In [8]:
# Patterns are compiled once, and written as raw strings so that `\W`, `\d`
# and `\s` reach the regex engine untouched instead of being parsed as
# (invalid) Python string escape sequences.
_NON_WORD_RE = re.compile(r'\W')
_DIGITS_RE = re.compile(r'\d+')
_WHITESPACE_RE = re.compile(r'\s+')


def text_cleaning(text):
    """Normalise a raw text string for bag-of-words vectorisation.

    Steps, in order:
      1. lower-case and trim the input;
      2. replace every non-word character (anything `\\W` matches, so
         punctuation and whitespace but not underscores) with a space;
      3. delete runs of digits outright (no space is inserted, so "a1b"
         becomes "ab");
      4. collapse runs of whitespace to a single space and trim.

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    str
        The cleaned text; an empty string if nothing survives cleaning.
    """
    text = text.lower().strip()
    text = _NON_WORD_RE.sub(' ', text)
    text = _DIGITS_RE.sub('', text)
    # The final collapse-and-strip also removes any padding left by step 2.
    return _WHITESPACE_RE.sub(' ', text).strip()

# Clean every document in the training text column, element by element.
df['text'] = df['text'].map(text_cleaning)

Stopwords and Stemming

In [9]:
stop_words = stopwords.words('english')
# Set copy of the stop-word list: the per-token membership test below becomes
# a hash lookup instead of a linear scan of the list for every token.
# `stop_words` itself is left as a list for any other code that uses it.
_stop_word_set = frozenset(stop_words)
stemmer = SnowballStemmer('english')


def stopword_stemming(text):
    """Drop English stop words from `text` and stem the remaining tokens.

    The text is tokenised with `word_tokenize`; each token whose lower-cased
    form is not a stop word is stemmed with the module-level Snowball stemmer.

    Parameters
    ----------
    text : str
        Text to process.

    Returns
    -------
    str
        The stemmed, non-stop-word tokens joined by single spaces.
    """
    return ' '.join(
        stemmer.stem(token)
        for token in word_tokenize(text)
        if token.lower() not in _stop_word_set
    )

# Remove stop words and stem the tokens of every training document.
df['text'] = df['text'].map(stopword_stemming)

#### Count Vectorizer Instance

In [10]:
# Bag-of-words features restricted to the 10,000 most frequent terms.
cv = CountVectorizer(max_features = 10000)
# Keep the document-term matrix sparse. Densifying 120,000 x 10,000 counts
# with `.toarray()` allocates several gigabytes that are almost all zeros,
# and scikit-learn's random forest accepts sparse input directly.
# NOTE(review): if another estimator in this notebook needs a dense array,
# convert at that call site rather than here.
X_train = cv.fit_transform(df['text'])

In [11]:
# Training labels as a plain Python list, in the same row order as the features.
y_train = df.loc[:, 'classes'].tolist()

##### Loading the test dataset

In [12]:
# Read the held-out test split from the current working directory.
df_test = pd.read_csv(filepath_or_buffer='test.csv')

In [13]:
# Prepare the test split with the same steps used for the training data:
# merge Title and Description, clean, remove stop words and stem.
# The merge is a vectorised concatenation rather than a row-wise apply.
df_test['text'] = df_test['Title'] + ' ' + df_test['Description']
df_test.drop(columns=['Title', 'Description'], inplace=True)
df_test['classes'] = df_test['Class Index']
df_test['text'] = df_test['text'].apply(text_cleaning).apply(stopword_stemming)
# `transform` (not `fit_transform`) reuses the vocabulary already fitted on
# `cv`, so the test columns line up with the training columns. The result is
# left sparse; the random forest accepts sparse input for prediction, and
# `.toarray()` would only allocate a large, mostly-zero dense matrix.
X_test = cv.transform(df_test['text'])
y_test = df_test['classes'].tolist()

### Random Forest Classification

In [16]:
# Random forest of 300 entropy-split trees.
# `random_state` pins bootstrap sampling and feature selection so the report
# below is reproducible on a fresh kernel; the value 42 is arbitrary.
# `n_jobs=-1` builds the trees on all available cores.
RF = RandomForestClassifier(
    criterion="entropy",
    n_estimators=300,
    random_state=42,
    n_jobs=-1,
)

RF.fit(X_train, y_train)
y_pred = RF.predict(X_test)
# Per-class precision / recall / F1 on the held-out test split.
print(classification_report(y_test, y_pred))

--------------------------------------- RandomForest --------------------------------------
              precision    recall  f1-score   support

           1       0.91      0.89      0.90      1900
           2       0.92      0.97      0.95      1900
           3       0.87      0.85      0.86      1900
           4       0.87      0.86      0.87      1900

    accuracy                           0.89      7600
   macro avg       0.89      0.89      0.89      7600
weighted avg       0.89      0.89      0.89      7600



##### The RF model reaches an accuracy of 89% on the test set

In [18]:
# Persist the trained forest to disk for later reuse.
# NOTE(review): the fitted CountVectorizer `cv` is not saved in this cell;
# confirm it is persisted elsewhere, since new text must be vectorised with
# the same vocabulary before this model can score it.
with open('RF.pkl', 'wb') as model_file:
    pickle.dump(RF, model_file)