### Installing required packages

In [1]:
# !pip install sentence-transformers
# !pip install fastapi uvicorn

### Importing required libraries

In [2]:
import pandas as pd
import re
import joblib
import threading
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from flask import Flask, request, jsonify


  from pandas.core import (


### Loading the dataset

In [3]:
df = pd.read_csv('articles.csv')
# I faced error while importing the dataset unicode decode error

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x92 in position 1114: invalid start byte

### Fixing issue in loading the dataset (Converting Utf-8 codec to ISO)

In [61]:
df = pd.read_csv('articles.csv', encoding='ISO-8859-1')

### Displaying the first 5 rows of the dataframe

In [62]:
df.head()

Unnamed: 0,Id,Heading,Article.Banner.Image,Outlets,Article.Description,Full_Article,Article_Type,Tonality
0,d6995462-5e87-453b-b64d-e9f1df6e94d2,"A Puzzling Maneuver, Then Freefall: NTSB Repor...",,Essex Caller,<p>The helicopter that crashed in Southeast Al...,<p>The helicopter that crashed in Southeast Al...,Commercial,Negative
1,8b05e939-a89e-4548-b92b-013822e8ee7d,Bells Nexus Air Taxi Concept Rings Changes Fo...,,Aviation Week Network,<p>A year after teasing the fledgling electric...,<p>A year after teasing the fledgling electric...,Commercial,Positive
2,69fcd400-bceb-4255-8277-619f2d68ac0b,Bell Helicopter Show Air Taxi Nexus,http://images.tmtpost.com/uploads/images/2019/...,TMTPost,<p>Bell released the full-size design of the v...,<p>Bell released the full-size design of the v...,Commercial,Positive
3,17943578-c11b-414b-b3f5-063d3a93157b,BELL DÉVOILE LA CONCEPTION INTÉGRALE DE SON TA...,http://www.fredzone.org/wp-content/uploads/201...,Fredzone,<p>Bell est une soci&eacute;t&eacute; am&eacut...,<p>Bell est une soci&eacute;t&eacute; am&eacut...,Commercial,Positive
4,f33c7b11-5f77-4a98-bb2e-d36689042aea,Les premiers retours dOlivier Ezratty,,FrenchWeb,<p>It was still anecdotal to observe the explo...,<p>It was still anecdotal to observe the explo...,Commercial,Positive


### Checking the shape of the dataset

In [63]:
df.shape

(4305, 8)

### Making a copy of the dataframe for further operations

In [64]:
df_copy = df.copy()

### Dropping the unwanted columns from the dataframe (Id, Article.Banner.Image, Outlets, Article_Type)

In [65]:
df_copy = df_copy.drop(columns=['Id', 'Article.Banner.Image', 'Outlets', 'Article_Type'])

### Checking for null values

In [66]:
df_copy.isnull().sum()

Heading                  0
Article.Description      0
Full_Article             0
Tonality               432
dtype: int64

### Dropping the rows with null values (for column 'Tonality')

In [67]:
df_copy = df_copy.dropna(subset=['Tonality'])

### Checking for null values

In [68]:
df_copy.isnull().sum()

Heading                0
Article.Description    0
Full_Article           0
Tonality               0
dtype: int64

### Cleaning the HTML tags and other special characters from columns (Heading, Article.Description & Full_Article)

In [69]:
def cleaning_text(text):
    text = re.sub(r'<.*?>','',text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^a-zA-Z0-9\s]','', text)
    return text.lower()

df_copy.loc[:,'Heading'] = df_copy['Heading'].apply(cleaning_text)
df_copy.loc[:,'Article.Description'] = df_copy['Article.Description'].apply(cleaning_text)
df_copy.loc[:,'Full_Article'] = df_copy['Full_Article'].apply(cleaning_text)

### Initializing the sentence Transformer

In [70]:
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')

### Encoding 'Full_Article' text into a numerical vector representation (embedding)

In [71]:
embeddings = sentence_model.encode(df_copy['Full_Article'].tolist(), show_progress_bar=True)

Batches:   0%|          | 0/122 [00:00<?, ?it/s]

### Splitting the dataset in 70 - 30 (Train and Test)

In [72]:
X = embeddings
y = df_copy['Tonality']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Initializing Random Forest Classifier

In [73]:
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)

### Hyper parameter tuning using GridSearchCV

In [74]:
param_grid = {
    'n_estimators': [50,100,200],
    'max_depth':[None, 10, 20, 30]
}

grid_search = GridSearchCV(rfc, param_grid, cv=3)
grid_search.fit(X_train, y_train)

### Evaluating the performance of the Hyper parameter tuning

In [75]:
y_pred = grid_search.best_estimator_.predict(X_test)
cr = classification_report(y_test, y_pred)
print("Classification Report: ", cr)

Classification Report:                precision    recall  f1-score   support

    Negative       0.93      0.79      0.86        72
     Neutral       1.00      0.04      0.07        52
    Positive       0.91      1.00      0.96       651

    accuracy                           0.92       775
   macro avg       0.95      0.61      0.63       775
weighted avg       0.92      0.92      0.89       775



### Saving the model as a Pickle

In [76]:
joblib.dump(grid_search.best_estimator_, 'article_classifier.pkl')

['article_classifier.pkl']

### API endpoint using FLASK

In [81]:
app = Flask(__name__)

# Loading the model
user_model = joblib.load('article_classifier.pkl')

@app.route('/predict', methods=['POST'])
def predict():
    data = request.json
    text = data['text']
    text_cleaned = cleaning_text(text)
    embeddings = sentence_model.encode([text_cleaned])
    prediction = user_model.predict(embeddings)[0]
    # Returning the final prediction response as a JSON 
    return jsonify({'prediction': prediction})

def my_app():
    app.run(debug=True, use_reloader=False)

thread = threading.Thread(target=my_app)
thread.start()

 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
