This file consists of 7,600 test samples of news articles, stored in 3 columns: the first column is the Class Id, the second is the Title, and the third is the Description. The class ids are numbered 1-4, where 1 represents World, 2 represents Sports, 3 represents Business, and 4 represents Sci/Tech.

# 1 - LOAD DATASET

In [13]:
import pandas as pd
# Read the training and test splits with the same UTF-8 decoding, then
# preview the first training rows as the cell output.
train_df, test_df = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in ('/content/train.csv', '/content/test.csv')
)
train_df.head()

Unnamed: 0,Class Index,Title,Description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


In [14]:
# Show the full, untruncated title stored under row label 0.
train_df.loc[0, 'Title']

'Wall St. Bears Claw Back Into the Black (Reuters)'

In [15]:
# Preview the first five rows of the test split.
test_df.head(n=5)

Unnamed: 0,Class Index,Title,Description
0,3,Fears for T N pension after talks,Unions representing workers at Turner Newall...
1,4,The Race is On: Second Private Team Sets Launc...,"SPACE.com - TORONTO, Canada -- A second\team o..."
2,4,Ky. Company Wins Grant to Study Peptides (AP),AP - A company founded by a chemistry research...
3,4,Prediction Unit Helps Forecast Wildfires (AP),AP - It's barely dawn when Mike Fitzpatrick st...
4,4,Calif. Aims to Limit Farm-Related Smog (AP),AP - Southern California's smog-fighting agenc...


# 2 - PREPROCESSING

In [16]:
# Print the number of missing values in each text column, one count per line
# (Title first, then Description).
print(*train_df[['Title', 'Description']].isnull().sum(), sep='\n')

0
0


In [17]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK corpora that clean_text relies on: the stop-word lists and
# the WordNet data used by the lemmatizer.
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

# Cache for the resources clean_text needs; filled on the first call so the
# stop-word corpus is read once instead of once per token.
_CLEAN_TEXT_RESOURCES = {}


def _get_clean_text_resources():
    """Return the cached ``(stop_words, lemmatizer)`` pair, building it on first use."""
    if not _CLEAN_TEXT_RESOURCES:
        # A frozenset gives constant-time membership tests, unlike the list
        # returned by stopwords.words().
        _CLEAN_TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _CLEAN_TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _CLEAN_TEXT_RESOURCES['stop_words'], _CLEAN_TEXT_RESOURCES['lemmatizer']


def clean_text(text):
    """Normalise a raw text string into space-separated, lemmatized tokens.

    Steps, in order:
      1. replace every non-word character with a space;
      2. drop stand-alone single ASCII letters (e.g. the "s" left behind by "Street's");
      3. lowercase and split on whitespace;
      4. remove English stop words and lemmatize what remains.

    Parameters
    ----------
    text : str or None
        Raw text. ``None`` and float NaN (pandas' missing-value marker) are
        treated as empty text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``''`` if nothing remains.
    """
    # Missing values would otherwise raise a TypeError inside re.sub.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    stop_words, lemmatizer = _get_clean_text_resources()

    text = re.sub(r'\W', ' ', text)  # Remove special characters
    # Lookarounds do not consume the neighbouring spaces, so consecutive single
    # letters ("T N") and letters at the very start or end are removed as well.
    text = re.sub(r'(?<!\w)[a-zA-Z](?!\w)', ' ', text)  # Remove single characters
    words = text.lower().split()  # Lowercase; split() also collapses repeated spaces

    return ' '.join(
        lemmatizer.lemmatize(word) for word in words if word not in stop_words
    )


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [18]:
# Add cleaned versions of both text columns to each split
# (training split first, then the test split).
for frame in (train_df, test_df):
    frame['cleaned_title'] = frame['Title'].apply(clean_text)
    frame['cleaned_description'] = frame['Description'].apply(clean_text)

In [19]:
# Preview the first five training rows, now including the cleaned columns.
train_df.head(n=5)

Unnamed: 0,Class Index,Title,Description,cleaned_title,cleaned_description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli...",wall st bear claw back black reuters,reuters short seller wall street dwindling ban...
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...,carlyle look toward commercial aerospace reuters,reuters private investment firm carlyle group ...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...,oil economy cloud stock outlook reuters,reuters soaring crude price plus worry economy...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...,iraq halt oil export main southern pipeline re...,reuters authority halted oil export flow main ...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco...",oil price soar time record posing new menace u...,afp tearaway world oil price toppling record s...


In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

# One TF-IDF vectorizer per text field, each limited to 5000 features.
vectorizer_title = TfidfVectorizer(max_features=5000)
vectorizer_desc = TfidfVectorizer(max_features=5000)

# Titles: fit on the training split, then reuse the fitted vectorizer for the test split.
X_title = vectorizer_title.fit_transform(train_df['cleaned_title'])
X_title_test = vectorizer_title.transform(test_df['cleaned_title'])

# Descriptions: same pattern with the second vectorizer.
X_description = vectorizer_desc.fit_transform(train_df['cleaned_description'])
X_description_test = vectorizer_desc.transform(test_df['cleaned_description'])

# Place title features and description features side by side.
X_combined = hstack((X_title, X_description))
X_combined_test = hstack((X_title_test, X_description_test))

In [21]:
# Target labels: the 'Class Index' column of each split.
y_train, y_test = train_df['Class Index'], test_df['Class Index']

# 3 - Building and Evaluating Models

In [23]:
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,BaggingClassifier,GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.svm import LinearSVC

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score,classification_report


# Create model instances
#
# Estimators with internal randomness get a fixed seed so that re-running the
# notebook reproduces the same scores. KNeighbors and the naive Bayes models
# take no random_state.
SEED = 68

rf_model = RandomForestClassifier(random_state=SEED)
#ada_model = AdaBoostClassifier()
#bag_model = BaggingClassifier()
gb_model = GradientBoostingClassifier(random_state=SEED)
dt_model = DecisionTreeClassifier(random_state=SEED)
knn_model = KNeighborsClassifier()
#xgb_model = XGBClassifier()
gnb_model = GaussianNB()
mnb_model = MultinomialNB()
svc_model = LinearSVC(random_state=SEED)


# Display name -> estimator for every model that should be trained and scored.
# Entries that are commented out are skipped; uncomment one to include it.
clf = {
    'RandomForest': rf_model,
    #'AdaBoost': ada_model,
    #'Bagging':bag_model,
    'Gradient Boosting': gb_model,
    #'DecisionTree':dt_model,
    #'KNeighbors':knn_model,
    #'XGBoost':xgb_model,
    #'GaussianNB':gnb_model,
    'MultinomialNB': mnb_model,
    'LinearSVC': svc_model,
}

def training_model(clf, X_train, X_test, y_train, y_test):
    """Fit ``clf`` on the training data and score its predictions on the test data.

    Parameters
    ----------
    clf : estimator exposing ``fit(X, y)`` and ``predict(X)``
    X_train, y_train : training features and labels passed to ``fit``
    X_test, y_test : features passed to ``predict`` and the labels to compare against

    Returns
    -------
    tuple
        ``(accuracy, report)`` as produced by ``accuracy_score`` and
        ``classification_report`` for the test predictions.
    """
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    summary = classification_report(y_test, predictions)
    return accuracy, summary


#X_train,X_test,y_train,y_test = train_test_split(X_combined,y,test_size = 0.2,random_state = 68)

# Train and score every registered model on the combined TF-IDF features.
# The loop variable is named `model` (not `clf`) so the `clf` dict is not
# overwritten while it is being iterated and can be looped over again later.
for name, model in clf.items():
    acc, report = training_model(model, X_combined, X_combined_test, y_train, y_test)
    print(f'For : {name}')
    print(f'Accuracy Score : {acc}')
    print(f'Classification Report :{report}' )
    print('*'*40)

For : RandomForest
Accuracy Score : 0.8852631578947369
Classification Report :              precision    recall  f1-score   support

           1       0.90      0.89      0.90      1900
           2       0.91      0.97      0.94      1900
           3       0.87      0.84      0.85      1900
           4       0.86      0.85      0.85      1900

    accuracy                           0.89      7600
   macro avg       0.88      0.89      0.88      7600
weighted avg       0.88      0.89      0.88      7600

****************************************
For : Gradient Boosting
Accuracy Score : 0.8231578947368421
Classification Report :              precision    recall  f1-score   support

           1       0.87      0.82      0.84      1900
           2       0.90      0.88      0.89      1900
           3       0.81      0.79      0.80      1900
           4       0.72      0.81      0.76      1900

    accuracy                           0.82      7600
   macro avg       0.83      0.82    



For : LinearSVC
Accuracy Score : 0.9093421052631578
Classification Report :              precision    recall  f1-score   support

           1       0.92      0.90      0.91      1900
           2       0.95      0.97      0.96      1900
           3       0.88      0.87      0.88      1900
           4       0.88      0.89      0.89      1900

    accuracy                           0.91      7600
   macro avg       0.91      0.91      0.91      7600
weighted avg       0.91      0.91      0.91      7600

****************************************
