In [34]:
import pandas as pd

In [35]:
import chardet

In [36]:
# Read the raw bytes once so chardet can guess the file's character encoding.
with open('articles.csv', 'rb') as f:
    raw_bytes = f.read()

result = chardet.detect(raw_bytes)
encoding = result['encoding']

# Parse the CSV with the encoding chardet reported.
art = pd.read_csv('articles.csv', encoding=encoding)

In [37]:
# Preview the first rows of the loaded articles frame (plain-text rendering).
print(art.head())

                                     Id  \
0  d6995462-5e87-453b-b64d-e9f1df6e94d2   
1  8b05e939-a89e-4548-b92b-013822e8ee7d   
2  69fcd400-bceb-4255-8277-619f2d68ac0b   
3  17943578-c11b-414b-b3f5-063d3a93157b   
4  f33c7b11-5f77-4a98-bb2e-d36689042aea   

                                             Heading  \
0  A Puzzling Maneuver, Then Freefall: NTSB Repor...   
1  Bell’s Nexus Air Taxi Concept Rings Changes Fo...   
2                Bell Helicopter Show Air Taxi Nexus   
3  BELL DÉVOILE LA CONCEPTION INTÉGRALE DE SON TA...   
4             Les premiers retours d’Olivier Ezratty   

                                Article.Banner.Image                Outlets  \
0                                                NaN           Essex Caller   
1                                                NaN  Aviation Week Network   
2  http://images.tmtpost.com/uploads/images/2019/...                TMTPost   
3  http://www.fredzone.org/wp-content/uploads/201...               Fredzone   
4        

In [38]:
# Preview the last rows of the articles frame.
art.tail()

Unnamed: 0,Id,Heading,Article.Banner.Image,Outlets,Article.Description,Full_Article,Article_Type,Tonality
4300,a40e5017-0a38-4d06-bcbe-616b73456c94,"Chinook Catches Army Flirting With Younger, Th...",https://www.duffelblog.com/wp-content/uploads/...,Duffel Blog,Long rumored tensions came to a head as the CH...,<p>Long rumored tensions came to a head as the...,Military,
4301,782ba519-bcb0-4ef1-873d-713a18b04576,Lufthansa Aviation Selects Reiser Simulation H...,,Vertical,<p>In the course of upcoming investments in ne...,<p>In the course of upcoming investments in ne...,Commercial,Positive
4302,deb31e5d-15c0-4c1d-843c-ce02e9081746,This Bell Flight Drone Won’t Be Delivering Pizza,,"WFAA-TV ABC (Dallas, TX)","<p>At Bell Flight in Fort Worth, engineers are...","<p>At Bell Flight in Fort Worth, engineers are...",Military,Positive
4303,f7125b1d-a687-469b-a799-c8cb4443b1d1,Blade Offers New York Airport Transfers for $1...,https://assets.bwbx.io/images/users/iqjWHBFdfx...,Bloomberg,<p>Getting to this price point took about five...,<p>Getting to this price point took about five...,Commercial,Positive
4304,c0d27375-9fc2-43dc-b709-26a2c48c462b,US ‘Little Birds’ Flying to Lebanon,https://www.arabianaerospace.aero/media/images...,Arabian Aerospace,Lebanon is to receive a new $120 Million US mi...,<p>Lebanon is to receive a new $120 Million US...,Military,


### Preprocessing

In [39]:
import re

In [40]:
def clean_text(text):
    """Normalize a piece of article text before it is embedded.

    Steps, in order:
      1. Missing values (``None`` or a float NaN, which is how pandas
         represents an empty cell) become the empty string.
      2. Every character that is neither a word character nor whitespace
         (i.e. punctuation and symbols) is removed.
      3. Each run of whitespace is collapsed to a single space.
      4. The result is lower-cased.

    Args:
        text: The raw text. Non-string values are converted with ``str()``.

    Returns:
        The cleaned, lower-cased string.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)
    # The character class must be [^\w\s]; the previous [^\W\S] can never
    # match any character, so punctuation was silently left in place.
    text = re.sub(r'[^\w\s]', '', text)
    # Collapse whitespace after removing punctuation so that "a - b" does not
    # leave a double space behind.
    text = re.sub(r'\s+', ' ', text)
    return text.lower()



In [None]:
# Build the text column that gets embedded: run clean_text over every article
# body. The frame is `art` and the body column is 'Full_Article' (see the
# art.tail() output). Empty cells are replaced with '' first so clean_text
# always receives a string.
art['cleaned_text'] = art['Full_Article'].fillna('').apply(clean_text)

### Vectorization

In [41]:
# The class is spelled SentenceTransformer; it is instantiated under that name
# in the following cells.
from sentence_transformers import SentenceTransformer

OSError: [WinError 126] The specified module could not be found. Error loading "C:\Users\HARIHARASUDHAN\AppData\Roaming\Python\Python311\site-packages\torch\lib\fbgemm.dll" or one of its dependencies.

In [None]:
# Instantiate the sentence-embedding model by its pretrained model name.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

In [None]:
# Encode every cleaned article; `embeddings` is the feature input to the
# train/test split and the cross-validation further down.
embeddings = model.encode(art['cleaned_text'].tolist())

### Select and Train ML Classifier Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# The module is lower-case `sklearn.pipeline`, and the class is `StandardScaler`.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

In [None]:
# Prepare data for training: hold out 20% of the rows for evaluation.
# random_state pins the shuffle so the split is reproducible.
# The label column is 'Article_Type' (capital T), as shown in the CSV header.
X_train, X_test, Y_train, Y_test = train_test_split(
    embeddings, art['Article_Type'], test_size=0.2, random_state=42
)

In [None]:
# Baseline classifier: logistic regression on the embeddings, with the
# iteration cap set to 1000.
clf = LogisticRegression(max_iter=1000)
# Fit on the training split only; the test split is kept for evaluation.
clf.fit(X_train, Y_train)

In [None]:
# Predict on the held-out split and print per-class precision/recall/F1.
y_pred = clf.predict(X_test)
# The true labels from the split are named Y_test (capital Y).
print(classification_report(Y_test, y_pred))

### Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Candidate values for LogisticRegression's `C` parameter.
parameters = {'C': [0.1, 1, 10]}
# Search over the grid with cv=5, fitting on the training split only.
grid_search = GridSearchCV(LogisticRegression(max_iter=1000), parameters, cv=5)
grid_search.fit(X_train, Y_train)

In [None]:
# The f prefix must sit outside the quotes; otherwise the placeholder is
# printed literally instead of being interpolated.
print(f'Best Parameters: {grid_search.best_params_}')

### Validate and Evaluate Accuracy

In [None]:
from sklearn.model_selection import cross_val_score


In [None]:
# Cross-validate the classifier over the full data set with cv=5.
# The label column is 'Article_Type' (capital T), as shown in the CSV header.
scores = cross_val_score(clf, embeddings, art['Article_Type'], cv=5)

In [None]:
# Show the individual cross-validation scores.
print(f'Cross-validation scores:{scores}')

In [None]:
# Average of the cross-validation scores.
print(f'Mean cross-validation score: {scores.mean()}')

### Performance Metrics


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Y_test holds the true labels from the train/test split; y_pred holds the
# classifier's predictions for X_test.
# average='weighted' is passed so each metric is a single aggregate score
# across all classes.
accuracy = accuracy_score(Y_test, y_pred)
precision = precision_score(Y_test, y_pred, average='weighted')
recall = recall_score(Y_test, y_pred, average='weighted')
f1 = f1_score(Y_test, y_pred, average='weighted')

print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1-Score: {f1:.2f}')

# Detailed per-class classification report
print('Classification Report:')
print(classification_report(Y_test, y_pred))

In [None]:
### Save and Reload the model

In [None]:
import joblib

# Save the model
# Persists the fitted classifier `clf` to disk in the working directory.
joblib.dump(clf, 'text_classifier_model.pkl')

# Load the model
# NOTE: joblib files are pickle-based, so only load files from a trusted source.
loaded_clf = joblib.load('text_classifier_model.pkl')

### Create an API Endpoint

In [None]:
from flask import Flask, request, jsonify
import joblib
from sentence_transformers import SentenceTransformer

app = Flask(__name__)

# Load model and tokenizer
# `model` is the classifier saved with joblib. This rebinds the name `model`,
# which an earlier cell bound to the SentenceTransformer.
model = joblib.load('text_classifier_model.pkl')
# Despite its name, `tokenizer` is the SentenceTransformer whose .encode() is
# used to turn text into embeddings.
tokenizer = SentenceTransformer('paraphrase-MiniLM-L6-v2')

@app.route('/predict', methods=['POST'])
def predict():
    """Classify one article sent as JSON.

    Expects a POST body of the form ``{"text": "<article text>"}``. The text
    is cleaned with ``clean_text``, embedded with ``tokenizer`` and classified
    with ``model``.

    Returns:
        ``{"prediction": "<label>"}`` on success, or
        ``{"error": "..."}`` with HTTP status 400 when the body is not JSON
        or has no string ``text`` field.
    """
    # silent=True yields None instead of raising when the body is not valid
    # JSON, so bad input gets a 400 response rather than an unhandled error.
    data = request.get_json(silent=True)
    if not isinstance(data, dict) or not isinstance(data.get('text'), str):
        return jsonify({'error': "Request body must be JSON with a string 'text' field."}), 400

    cleaned_text = clean_text(data['text'])
    # encode() takes a list of texts; a single-item list gives one embedding row.
    embedding = tokenizer.encode([cleaned_text])
    prediction = model.predict(embedding)
    # Cast to a built-in str so the label is JSON-serializable whatever scalar
    # type the classifier returns.
    return jsonify({'prediction': str(prediction[0])})

if __name__ == '__main__':
    # Debug mode is off on purpose: it enables the interactive debugger, which
    # lets anyone who can reach the server run arbitrary code, and it starts
    # the auto-reloader.
    # NOTE(review): the reloader is also known to misbehave inside a notebook
    # kernel; confirm before turning debug back on.
    app.run(debug=False)


### Extract Data from unknown_articles.csv

In [None]:
# Load the unlabeled articles that are to be classified.
# NOTE(review): articles.csv was read with an explicitly detected encoding;
# confirm this file decodes correctly with pandas' default.
unknown_df = pd.read_csv('unknown_articles.csv')
# Extract headings and full articles
# Implement necessary preprocessing here



In [None]:
# Apply the same clean_text preprocessing that was used on the training text
# before embedding, so the classifier sees inputs prepared the same way.
# Empty cells become '' so clean_text always receives a string.
# NOTE(review): the training CSV names this column 'Full_Article'; confirm the
# column name used in unknown_articles.csv.
unknown_texts = unknown_df['full_article'].fillna('').apply(clean_text).tolist()
new_embeddings = tokenizer.encode(unknown_texts)
predictions = model.predict(new_embeddings)

## Thank You