In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.preprocessing import StandardScaler
from scipy import sparse

# Load and preprocess the data
train_dataset = pd.read_csv('train_dataset.csv')

# Basic TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=50000)
X_tfidf = vectorizer.fit_transform(train_dataset['headline'])

# Tokenization
train_dataset['tokenized_text'] = train_dataset['headline'].apply(lambda x: word_tokenize(x.lower()))

# Removing Stop Words
stop_words = set(stopwords.words('english'))
train_dataset['stopwords_removed'] = train_dataset['tokenized_text'].apply(lambda x: [word for word in x if word.lower() not in stop_words])

# Word Counts
train_dataset['word_count'] = train_dataset['tokenized_text'].apply(len)

# Character Counts
train_dataset['char_count'] = train_dataset['headline'].apply(len)

# Average Word Length
train_dataset['avg_word_length'] = train_dataset.apply(lambda row: row['char_count'] / row['word_count'], axis=1)

# Other custom features based on the text

# Combine features into a single matrix
additional_features = train_dataset[['word_count', 'char_count', 'avg_word_length']].values
additional_features = StandardScaler().fit_transform(additional_features)
X_combined = sparse.hstack((X_tfidf, sparse.csr_matrix(additional_features)))

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, train_dataset['clickbait'], test_size=0.09, random_state=50)

# Train the model
classification_model = RandomForestClassifier(random_state=50)
classification_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = classification_model.predict(X_test)

# Evaluate the classification model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Print the evaluation results
print(f"Accuracy: {accuracy}")
print("Classification Report:\n", report)


[nltk_data] Downloading package punkt to /home/delish/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/delish/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Accuracy: 0.9755116959064327
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.98      0.98      1372
           1       0.98      0.97      0.98      1364

    accuracy                           0.98      2736
   macro avg       0.98      0.98      0.98      2736
weighted avg       0.98      0.98      0.98      2736



In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Assuming 'train_dataset' and 'test_dataset' are your training and test DataFrames

# Vectorize the training data
vectorizer = TfidfVectorizer(max_features=1000)  # Use the same parameters as used during training
X_train_tfidf = vectorizer.fit_transform(train_dataset['headline'])

# Vectorize the test data using the same vectorizer
X_test_tfidf = vectorizer.transform(test_dataset['headline'])

# Train your model using RandomForestClassifier (or any other classifier)
from sklearn.ensemble import RandomForestClassifier

classification_model = RandomForestClassifier(random_state=42)
classification_model.fit(X_train_tfidf, train_dataset['clickbait'])

# Make predictions on the test set
test_predictions = classification_model.predict_proba(X_test_tfidf)[:, 1]

# Assuming you have obtained 'test_predictions' from your model

threshold = 0.5  # Set your threshold value here

# Convert probabilities to binary predictions based on the threshold
binary_predictions = [1 if prob >= threshold else 0 for prob in test_predictions]

# Create a DataFrame with ID and predicted probabilities
submission_df = pd.DataFrame({'ID': test_dataset['ID'], 'clickbait': binary_predictions})

# Save the predictions to a CSV file
submission_df.to_csv('submission03.csv', index=False)


In [5]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Generating synthetic data
X, y = make_classification(n_samples=15200, n_features=20, n_classes=2, random_state=42)

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initializing the model and training it
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Making predictions
y_pred = model.predict(X_test)

# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Printing the evaluation results
print(f"Accuracy: {accuracy}")
print("Classification Report:\n", report)


Accuracy: 0.9105263157894737
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.92      0.91      1518
           1       0.92      0.90      0.91      1522

    accuracy                           0.91      3040
   macro avg       0.91      0.91      0.91      3040
weighted avg       0.91      0.91      0.91      3040

