<a href="https://colab.research.google.com/github/Hristy26/news-articles/blob/main/news_articles_preprocessing_galvan.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
#Import our dependencies
!pip install gradio
!pip install sentence-transformers==2.2.2
import pandas as pd
import numpy as np
import gradio as gr
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.datasets import make_blobs
from sklearn.metrics import classification_report
from sklearn.compose import ColumnTransformer
from sentence_transformers import SentenceTransformer
from imblearn.under_sampling import RandomUnderSampler
import matplotlib.pyplot as plt

#Models to use in our pipeline
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier




In [13]:
#Import and read news articles
articles_df = pd.read_csv("news_articles.csv")

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
articles_df.head()

In [None]:
articles_df.info()

In [None]:
articles_df.nunique()

In [None]:
articles_df["label"].value_counts()

In [None]:
# Convert the "title" column from the news articles DataFrame to a list.
title_list = articles_df["title"].tolist()
title_list

In [None]:
# Convert the "text" column from the news articles DataFrame to a list.
text_list = articles_df["text"].tolist()
text_list

In [None]:
# Convert the "title_without_stopwords" column from the news articles DataFrame to a list.
title_without_stopwords_list = articles_df["title_without_stopwords"].tolist()
title_without_stopwords_list

In [None]:
# Convert the "text_without_stopwords" column from the news articles DataFrame to a list.
text_without_stopwords_list = articles_df["text_without_stopwords"].tolist()
text_without_stopwords_list

In [None]:
# Create an instance of the label encoder
le = LabelEncoder()

# Copy the DataFrame so the raw articles stay untouched
encoded_articles_df = articles_df.copy()

# Fit and transform the label encoder for each column.
# The same encoder object is refit on every iteration, so once the loop is
# done `le` only remembers the classes of the last column it saw.
# NOTE(review): every column, including free-text ones, is replaced by an
# arbitrary integer id per distinct value - confirm this is the intended
# feature representation.
for column in encoded_articles_df.columns:
    encoded_articles_df[column] = le.fit_transform(encoded_articles_df[column])

encoded_articles_df.head()

In [None]:
encoded_articles_df.info()

In [None]:
encoded_articles_df.shape

In [None]:
encoded_articles_df["label"].value_counts()

In [None]:
encoded_articles_df["label"].unique()

In [None]:
y_encoded_df = encoded_articles_df["label"]
y_encoded_df.head()

In [None]:
X_encoded_df = encoded_articles_df.drop(["label"], axis = 1)
X_encoded_df.head()

In [None]:
#Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded_df, y_encoded_df, random_state=1)

In [None]:
#Create the model
model = LogisticRegression()

In [None]:
#Fit the model to the training data
model.fit(X_train, y_train)

In [None]:
# Calculate the mean squared error, root mean squared error and
# r-squared value for the testing data.
# NOTE(review): these are regression metrics, while `model` is a
# LogisticRegression classifier predicting label-encoded classes; accuracy or
# a classification report is the more meaningful measure here.

# Use our model to make predictions
predicted = model.predict(X_test)

# Score the predictions with mse and r2
mse = mean_squared_error(y_test, predicted)
r2 = r2_score(y_test, predicted)
# Reuse the MSE computed above instead of scoring the predictions twice
rmse = np.sqrt(mse)

print(f"mean squared error (MSE): {mse}")
print(f"R-squared (R2): {r2}")
print(f"Root mean squared error (RMSE): {rmse}")

In [None]:
# Call the `score()` method on the model. `model` is the LogisticRegression
# classifier created above, so this value is the mean accuracy on the test
# split, not an R2 score.
model.score(X_test, y_test)

In [None]:
def text_classification(articles_df):
    """Train a TF-IDF + linear SVM classifier on the article text.

    Args:
        articles_df: DataFrame holding a "text" column (the feature) and a
            "label" column (the target).

    Returns:
        The fitted Pipeline (TfidfVectorizer followed by LinearSVC).
    """
    # Only the two columns used below have to be complete; a missing value in
    # an unrelated column is no reason to throw a training row away.
    articles_df = articles_df.dropna(subset=['text', 'label'])

    # Set the features variable to the "text" column.
    X = articles_df['text']

    # Set the target variable to the "label" column.
    y = articles_df['label']

    # Split data into training and testing and set the test_size = 33%.
    # The held-out part is not used inside this function.
    X_train, _X_test, y_train, _y_test = train_test_split(X, y, test_size=0.33, random_state=42)

    # info() prints its summary itself and returns None, so wrapping it in
    # print() would only add a stray "None" line to the output.
    y_train.info()

    # Build a pipeline that vectorizes the raw text and feeds it to the SVM.
    # A distinct local name keeps the function's own name unshadowed.
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('clf', LinearSVC()),
    ])

    # Fit the pipeline on the training data and return it (fit returns the pipeline itself).
    return pipeline.fit(X_train, y_train)

In [None]:
# Call the title_classification function with the DataFrame and set the result to the "title_clf" variable
text_classification = text_classification(articles_df)
text_classification

In [None]:
# Create a function called `text_prediction` that takes in the text and predicts the whether the text is "fake" or "real".
# The function should return the Text, and say whether the text is "fake" or "real".
def text_prediction(text):
    """Classify a piece of text as real or fake.

    Uses the module-level `text_classification` object, which by the time this
    runs must be the fitted pipeline (it is rebound from the training
    function to its result in the cell above).

    Args:
        text: The text to classify.

    Returns:
        A sentence quoting the text and stating whether it is real or fake.
    """
    # predict() takes a collection of documents, so wrap the single text in a
    # list and read back the one prediction.
    predicted_label = text_classification.predict([text])[0]

    # The prediction carries whatever spelling the training "label" column
    # uses. That casing is not visible here, so compare case-insensitively:
    # an exact match against 'real' would otherwise report every text as fake
    # whenever the labels are written e.g. "Real".
    if str(predicted_label).strip().lower() == 'real':
        return f'The text: "{text}", is real.'
    return f'The text: "{text}", is fake.'

In [None]:
articles_df['label'].value_counts()

In [None]:
# Build the app: one textbox for the text to check and one textbox for the
# verdict returned by text_prediction. Each textbox gets its own label.
text_input = gr.Textbox(label="What is the text you want to test?")
verdict_output = gr.Textbox(
    lines=10,
    label="Our app has determined: ",
    show_copy_button=True,
)

app = gr.Interface(
    fn=text_prediction,
    inputs=[text_input],
    outputs=verdict_output,
)

# Launch the app with sharing enabled.
app.launch(share=True)

In [None]:
# Create the random forest classifier model.
# Tree depth is capped at 5. No random_state is passed, so the fitted forest
# (and therefore the scores below) can differ from run to run.

#randomforest_model = RandomForestClassifier(n_estimators=128, random_state=1)
randomforest_model = RandomForestClassifier(max_depth=5)

In [None]:
# Fit the model to the training data
randomforest_model.fit(X_train, y_train)

In [None]:
# Validate the model by checking the model accuracy with model.score
print(f"Training Data Score: {randomforest_model.score(X_train, y_train)}")
print(f"Testing Data Score: {randomforest_model.score(X_test, y_test)}")

In [None]:
# Make predictions and produce the classification report for the random forest model
predictions = randomforest_model.predict(X_test)
print(classification_report(y_test, predictions))

In [None]:
articles_df.dtypes

In [None]:
def read_process(articles_df, features, target):
    """Build a one-hot encoded feature matrix and an integer-encoded target.

    Args:
        articles_df: Source DataFrame.
        features: List of column names to use as features.
        target: Name of the target column.

    Returns:
        A tuple ``(X, y_encoded)``. ``X`` is the feature DataFrame with every
        object/category column expanded into float dummy columns; ``y_encoded``
        is the target column encoded as integers by LabelEncoder.
    """
    # Drop rows with a missing value in any column that is actually used.
    # Gaps in columns outside features/target do not cost a row.
    articles_df = articles_df.dropna(subset=[*features, target])
    X = articles_df[features]
    y = articles_df[target]

    # Find the categorical (object/category) feature columns
    categorical_columns = X.select_dtypes(include=['object', 'category']).columns

    # One-hot encode the categorical columns; other columns pass through unchanged.
    # NOTE(review): a free-text column yields one dummy column per distinct
    # value - confirm this is intended for the text features.
    X = pd.get_dummies(X, columns=categorical_columns, dtype=float)

    # Encode the target classes as integers
    y_encoded = LabelEncoder().fit_transform(y)

    return X, y_encoded

def model_generator(articles_df, features, target):
    """Fit several classifiers on one train/test split and report their scores.

    Args:
        articles_df: Source DataFrame.
        features: List of column names to use as features.
        target: Name of the target column.

    Returns:
        Dict mapping each model's display name to its score on the test split.
    """
    # The target is categorical, so the support-vector entry has to be the
    # classifier (SVC). SVR is a regressor whose score() is R2 and cannot be
    # compared with the accuracy reported by the other models. Imported
    # locally because the file-level imports only provide SVR and LinearSVC.
    from sklearn.svm import SVC

    X, y = read_process(articles_df, features, target)

    # Train-test split (fixed seed so every model sees the same split)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    models = {
        "Logistic Regression": LogisticRegression(),
        "SVC": SVC(),
        "Random Forest": RandomForestClassifier(),
        "Gradient Boosting": GradientBoostingClassifier(),
        "Decision Tree": DecisionTreeClassifier(),
    }

    results = {}

    for name, model in models.items():
        pipeline = Pipeline([
            ("scale", StandardScaler()),  # Standardize the (already one-hot encoded) features
            (name, model),
        ])

        pipeline.fit(X_train, y_train)

        # score() runs the prediction on the test split itself, so no
        # separate predict() pass is needed.
        score = pipeline.score(X_test, y_test)
        results[name] = score
        print(f"{name} Score: {score}")

    return results


In [None]:
# Use every column except "label" (the target) and "type" as a feature
features = articles_df.drop(["label", "type"], axis=1).columns.to_list()
# Name of the column the models should predict
target = "label"

In [None]:
articles_df['label'].value_counts()

In [None]:
read_process(articles_df, features, target)

In [None]:
#Generate accuracy score by model
model_generator(articles_df, features, target)

OPTIMIZATION - HYPERPARAMETERS

In [None]:
# Create KNN classifier
from sklearn.neighbors import KNeighborsClassifier
random_tuned_model = KNeighborsClassifier()

In [None]:
# Create the parameter object for the randomized search estimator.
# n_neighbors: the odd values from 1 through 19 (step of 2).
# leaf_size: every integer from 1 through 499 (np.arange excludes the stop value 500).
# weights: both the uniform and distance options.

param_grid = {
     'n_neighbors': np.arange(1,20,2),
     'weights': ['uniform', 'distance'],
     'leaf_size': np.arange(1, 500)
 }
param_grid


In [None]:
# Create the randomized search estimator
from sklearn.model_selection import RandomizedSearchCV
random_clf = RandomizedSearchCV(random_tuned_model, param_grid, random_state=0, verbose=3)

In [None]:
# Fit the model by using the randomized search estimator.
random_clf.fit(X_train, y_train)

In [None]:
# List the best parameters for this dataset
print(random_clf.best_params_)

In [None]:
# Print the classification report for the best model
grid_y_pred = random_clf.predict(X_test)
print(classification_report(y_test, grid_y_pred))

In [None]:
# Make predictions with the hypertuned model
random_tuned_pred = random_clf.predict(X_test)

In [None]:
# Calculate the classification report
print(classification_report(y_test, random_tuned_pred))