### IntelliNews NLP Training Notebook

In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
import joblib
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to C:\Users\Muhammad
[nltk_data]     Haseeb\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Muhammad
[nltk_data]     Haseeb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Load dataset
df = pd.read_csv('all_data.csv', encoding='ISO-8859-1', header=None)
df.head(5)

Unnamed: 0,0,1
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


In [5]:
df.tail()

Unnamed: 0,0,1
4841,negative,LONDON MarketWatch -- Share prices ended lower...
4842,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4843,negative,Operating profit fell to EUR 35.4 mn from EUR ...
4844,negative,Net sales of the Paper segment decreased to EU...
4845,negative,Sales in Finland decreased by 10.5 % in Januar...


In [6]:
df.describe()

Unnamed: 0,0,1
count,4846,4846
unique,3,4838
top,neutral,Ahlstrom 's share is quoted on the NASDAQ OMX ...
freq,2879,2


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4846 entries, 0 to 4845
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       4846 non-null   object
 1   1       4846 non-null   object
dtypes: object(2)
memory usage: 75.8+ KB


In [8]:
df.columns

Index([0, 1], dtype='int64')

In [9]:
df.shape

(4846, 2)

In [10]:
df.isnull().sum()

0    0
1    0
dtype: int64

In [11]:
df.duplicated().sum()

np.int64(6)

In [12]:
df.shape

(4846, 2)

In [14]:
df.columns = ['Sentiment', 'Text']

In [15]:
# NLP preprocessing
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    filtered_tokens = [stemmer.stem(word) for word in tokens if word.isalpha() and word not in stop_words]
    return " ".join(filtered_tokens)

df["ProcessedText"] = df["Text"].apply(preprocess_text)


In [16]:
label_encoder = LabelEncoder()
df["SentimentEncoded"] = label_encoder.fit_transform(df["Sentiment"])


In [17]:
vectorizer = TfidfVectorizer(max_features=3000)
X = vectorizer.fit_transform(df["ProcessedText"])
y = df["SentimentEncoded"]


In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [19]:
model = XGBClassifier(n_estimators=100, max_depth=4, use_label_encoder=False, eval_metric="mlogloss")
model.fit(X_train, y_train)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [17]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))


              precision    recall  f1-score   support

    negative       0.92      0.74      0.82       176
     neutral       0.84      0.92      0.88       211
    positive       0.82      0.86      0.84       227

    accuracy                           0.85       614
   macro avg       0.86      0.84      0.85       614
weighted avg       0.86      0.85      0.85       614



In [20]:
joblib.dump(model, "xgboost_sentiment_model.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")
joblib.dump(label_encoder, "label_encoder.pkl")

['label_encoder.pkl']

In [21]:
def predict_sentiment(text):
    vec = vectorizer.transform([text])
    pred = model.predict(vec)
    print("Raw prediction:", pred)
    return label_encoder.inverse_transform(pred)[0]

In [22]:
# Sample Tests
print(predict_sentiment("Stocks rose sharply as earnings beat expectations."))
print(predict_sentiment("Company faces investigation for accounting fraud."))
print(predict_sentiment("The report was published on Tuesday."))

Raw prediction: [2]
positive
Raw prediction: [1]
neutral
Raw prediction: [1]
neutral
