In [14]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Load train dataset
train_dataset = pd.read_csv('train_dataset.csv')

# Load test dataset
test_dataset = pd.read_csv('test_dataset.csv')

# TF-IDF vectorization for text data
vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = vectorizer.fit_transform(train_dataset['headline'])
X_test_tfidf = vectorizer.transform(test_dataset['headline'])

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_tfidf, train_dataset['clickbait'], test_size=0.2, random_state=42)

# Initialize and train the classification model
classification_model = RandomForestClassifier(random_state=42)
classification_model.fit(X_train, y_train)

# Make predictions on the validation set
val_predictions = classification_model.predict(X_val)

# Evaluate the model
accuracy = accuracy_score(y_val, val_predictions)
report = classification_report(y_val, val_predictions)

# Print evaluation results
print(f"Validation Accuracy: {accuracy}")
print("Classification Report:\n", report)

# Make predictions on the test set
test_predictions = classification_model.predict(X_test_tfidf)

# Prepare submission file
submission_df = pd.DataFrame({'ID': test_dataset['ID'], 'clickbait': test_predictions})
submission_df.to_csv('submission222.csv', index=False)


Validation Accuracy: 0.9623355263157894
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.96      0.96      3055
           1       0.96      0.97      0.96      3025

    accuracy                           0.96      6080
   macro avg       0.96      0.96      0.96      6080
weighted avg       0.96      0.96      0.96      6080



In [5]:
# Importing DataScience libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm,skew
from scipy import stats
%matplotlib inline
from sklearn.linear_model import LinearRegression,Ridge,RidgeCV, ElasticNetCV, LassoCV,BayesianRidge
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler,StandardScaler,MinMaxScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split, GridSearchCV
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error


In [6]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for RandomForestClassifier
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    # Add other parameters to tune
}

# Perform Grid Search
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5)
grid_search.fit(X_train_tfidf, y_train)

# Check the best parameters
print(grid_search.best_params_)

# Get the best model
best_rf_model = grid_search.best_estimator_


NameError: name 'RandomForestClassifier' is not defined

In [7]:

# Load train_dataset.csv and test_dataset.csv
train_dataset = pd.read_csv('train_dataset.csv')
test_dataset = pd.read_csv('test_dataset.csv')

In [8]:
train_dataset.head()

Unnamed: 0,headline,clickbait,ID
0,"Hussein enters ""not guilty"" plea at trial",0,84698cc7-8ae2-4ea3-a425-b7091561cee6
1,Iraq peace talks draw to a close in Finland,0,a4e35ca4-15fa-43e8-b68e-91457b23afee
2,British Premier Visits Northern Ireland,0,162991ee-ea2f-41ad-a753-649a68f54311
3,The Decline of Left-Handed First Basemen,0,2cd3aa32-6ec2-4af1-bd1d-560709066b8b
4,Who Said It: Donald Trump Or Kanye West,1,72553370-c348-4603-882b-39e04b610c39


In [9]:
test_dataset.head()

Unnamed: 0,ID,headline
0,5f99b099-c4db-4a02-9753-28c5e94a6b34,Israeli military launches airstrikes into Gaza...
1,3c413552-32c0-4000-a745-b4217fe427ca,Expelled' fair use upheld
2,71060e3b-bab0-4218-b1ce-8284ae46f6c3,31 Times Frankie Boyle's Twitter Was Out Of Co...
3,f0a03121-600f-4b69-b6db-989d0f3cf28a,What Does Your Zodiac Sign Say About Your Love...
4,456f7cfa-bdfe-45bd-9e88-7c4ae53eb4ba,Larson B ice-shelf collapse reveals exotic org...


In [10]:


# The rows and columns of our dataset 
train_dataset.shape



(30400, 3)

In [11]:
# Well we have to deal with plenty of attributes 
train_dataset.columns

Index(['headline', 'clickbait', 'ID'], dtype='object')

In [13]:
# Datatype of each attribute
test_dataset.dtypes


ID          object
headline    object
dtype: object

In [None]:
# Descriptive Statistics of Numerical Variables
train_dataset.describe()

In [None]:


# Statistics of our Categorical variables
train_dataset.describe(include=['O'])



In [None]:
# Checking for the Missing values
# Using isnull fuction to count the total null values in each field
total = train_dataset.isnull().sum().sort_values(ascending=False) 
# Percent of missing values is estimated by dividing total missing and the original total
percent = (train_dataset.isnull().sum()/train_dataset.isnull().count()).sort_values(ascending=False)
# Concatenating the Total and Percent fields sing pandas concat fucntion
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
# Displays top 20 from our max sorted list
missing_data.head(20)


In [None]:
# Well! Things look better now! 
train_dataset.isnull().sum().max()



In [None]:
import sys
import nltk
import sklearn
import pandas
import numpy

print('Python: {}'.format(sys.version))
print('NLTK: {}'.format(nltk.__version__))
print('Scikit-learn: {}'.format(sklearn.__version__))
print('Pandas: {}'.format(pandas.__version__))
print('Numpy: {}'.format(numpy.__version__))

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier  # Import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(train_dataset['headline'], train_dataset['clickbait'], test_size=0.2, random_state=42)

# Convert text data to numerical features using TF-IDF
vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Initialize and train the classification model
classification_model = RandomForestClassifier(random_state=42)
classification_model.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred = classification_model.predict(X_test_tfidf)

# Evaluate the classification model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Print the evaluation results
print(f"Accuracy: {accuracy}")
print("Classification Report:\n", report)


In [None]:
# Load the test dataset (assuming 'test_data.csv' as the file name)
test_dataset = pd.read_csv('test_dataset.csv')

# Preprocess the test data
X_test_tfidf = vectorizer.transform(test_dataset['headline'])

# Make predictions on the test set
test_predictions = classification_model.predict_proba(X_test_tfidf)[:, 1]  # Probability of being clickbait

# Assuming you have already obtained test_predictions from your model

threshold = 0.5  # Set your threshold value here

# Convert probabilities to binary predictions based on the threshold
binary_predictions = [1 if prob >= threshold else 0 for prob in test_predictions]

# Create a DataFrame with ID and predicted probabilities
submission_df = pd.DataFrame({'ID': test_dataset['ID'], 'clickbait': binary_predictions})

# Save the predictions to a CSV file
submission_df.to_csv('submission.csv', index=False)


In [None]:


# We can use sklearn algorithms in NLTK
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC

model = SklearnClassifier(SVC(kernel = 'linear'))

# train the model on the training data
model.clickbait(training)

# and test on the testing dataset!
accuracy = nltk.classify.accuracy(model, testing)*100
print("SVC Accuracy: {}".format(accuracy))



In [None]:
Python: 3.10.6 (main, Mar 10 2023, 10:55:28) [GCC 11.3.0]
NLTK: 3.8.1
Scikit-learn: 1.3.0
Pandas: 1.5.3
Numpy: 1.21.5

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.ensemble import RandomForestClassifier  # Import RandomForestClassifier

from sklearn.metrics import accuracy_score, classification_report

​

# Split the data into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(train_dataset['headline'], train_dataset['clickbait'], test_size=0.2, random_state=50)

​

# Convert text data to numerical features using TF-IDF

vectorizer = TfidfVectorizer(max_features=1000)

X_train_tfidf = vectorizer.fit_transform(X_train)

X_test_tfidf = vectorizer.transform(X_test)

​

# Initialize and train the classification model

classification_model = RandomForestClassifier(random_state=42)

classification_model.fit(X_train_tfidf, y_train)

​

# Make predictions on the test set

y_pred = classification_model.predict(X_test_tfidf)

​

# Evaluate the classification model

accuracy = accuracy_score(y_test, y_pred)

report = classification_report(y_test, y_pred)

​

# Print the evaluation results

print(f"Accuracy: {accuracy}")

print("Classification Report:\n", report)

​

Accuracy: 0.9629934210526315
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.96      0.96      3046
           1       0.96      0.97      0.96      3034

    accuracy                           0.96      6080
   macro avg       0.96      0.96      0.96      6080
weighted avg       0.96      0.96      0.96      6080

