In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
import pickle
import string
import nltk
import re

In [54]:
# Download nltk stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [55]:
# Load health claims dataset to a pandas DataFrame
health_claims = pd.read_csv('/content/claims')

In [56]:
health_claims.shape #not necessary

(25096, 9)

In [57]:
# Print first 4 rows of the dataset
print(health_claims.head(4))

     id name                                          statement  \
0  6771  NaN  Dawn dish soap contains ammonia even though it...   
1   799  NaN  Do probiotics help with Irritable Bowel Syndrome?   
2  2468  NaN                 Is blue light harmful to our eyes?   
3  2634  NaN               Does caffeine help asthma sufferers?   

                                         description           category  \
0  Could Mixing Dawn Dish Soap with Clorox Bleach...  {medical,science}   
1                                                NaN          {medical}   
2  <p>It seems reasonable to reduce exposure to b...          {medical}   
3  <p><a href="https://www.nytimes.com/2010/11/30...          {medical}   

   rating queries                     created_at  \
0       0     NaN  2020-06-28 21:40:18.799231+02   
1       1     NaN  2019-12-13 14:27:18.231183+01   
2       1     NaN  2019-12-13 14:31:00.722277+01   
3       1     NaN  2019-12-13 14:31:25.271774+01   

                      updat

In [58]:
#counting missing values in dataset
health_claims.isnull().sum() #not necessary

id                 0
name           24965
statement          0
description    23407
category           0
rating             0
queries        24965
created_at     22729
updated_at     22729
dtype: int64

In [59]:
# Replace null values with empty string
health_claims = health_claims.fillna('')

In [60]:
# Placing the statement column columns into 'contents' column
health_claims['contents'] = health_claims['statement']

In [61]:
# Separate data and label
X = health_claims.drop(columns='rating', axis=1)
Y = health_claims['rating']

In [62]:
# Define word optimization function to clean text data
def clean_text(text):
    claim = text.lower()
    claim = re.sub('\[.*?\]', '', text)
    claim = re.sub("\\W"," ",text) 
    claim = re.sub('https?://\S+|www\.\S+', '', text)
    claim = re.sub('<.*?>+', '', text)
    claim = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    claim = re.sub('\n', '', text)
    claim = re.sub('\w*\d\w*', '', text)    
    return claim

In [63]:
# Clean 'contents' column using the clean_text function
health_claims['contents'] = health_claims['contents'].apply(clean_text)

In [64]:
X = health_claims['contents'].values
Y = health_claims['rating'].values

In [65]:
# Convert text data to numerical data using TfidfVectorizer
vector = TfidfVectorizer(stop_words='english', max_df=0.7)
vector.fit(X)
X = vector.transform(X)

In [66]:
# Split data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(X, Y, stratify=Y, test_size=0.2, random_state=2)

In [67]:
#Passive Aggressive Classifier

In [68]:
# Train the Passive Aggressive Classifier model
model = PassiveAggressiveClassifier(max_iter=100)
model.fit(x_train, y_train)



In [69]:
# Evaluate the model on the training set
model_score = model.score(x_train, y_train)
print("Model score on training set: ", model_score)

Model score on training set:  0.9894899382347081


In [70]:
def classify_health_claims(news):
    input_data = [news]
    vectorized_data = vector.transform(input_data)
    prediction = model.predict(vectorized_data)
    return prediction

In [71]:
# Test the function with an input news
news = "Acupuncture Best for Hot Flashes in Breast Cancer Survivors"
prediction = classify_health_claims(news)
print("Prediction: ", prediction)

Prediction:  [1]


In [72]:
# Save the vectorizer and model as pickle files
filename_vectorizer = 'detection-vector.pkl'
filename_model = 'detection-model.pkl'
pickle.dump(vector, open(filename_vectorizer, 'wb'))
pickle.dump(model, open(filename_model, 'wb'))