In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Load your dataset
df = pd.read_csv('abc.csv')

# Steps 1-2: Build the combined text field from 'title' and 'text'.
# Missing values are filled per column *before* concatenating: in pandas,
# str + NaN yields NaN, so filling only after the concatenation would throw
# away a present title (or body) whenever the other field is missing.
title_text = df['title'].fillna('').astype(str)
body_text = df['text'].fillna('').astype(str)
# strip() removes the dangling separator left when one side is empty, so a row
# with both fields missing still ends up as the empty string.
df['cleaned_text'] = (title_text + " " + body_text).str.strip()

# Step 3: Vectorize the text data using TF-IDF
# (vocabulary capped at 500 terms, English stop words removed).
# NOTE(review): the vectorizer is fitted on every row, including rows that the
# later train/test split assigns to the test set, so the reported test metrics
# can be optimistic. Consider splitting first and fitting on the training rows.
tfidf_vectorizer = TfidfVectorizer(max_features=500, stop_words='english')
X_tfidf = tfidf_vectorizer.fit_transform(df['cleaned_text'])

# Step 4: Extract numerical features
# NOTE(review): missing values in these columns are passed through unchanged —
# confirm the CSV has none, or impute them here.
X_numerical = df[['score', 'upvotes', 'comments']]

# Step 5: Combine the numerical features and the (densified) TF-IDF matrix
# column-wise; the three numerical columns come first.
X_combined = np.hstack([X_numerical, X_tfidf.toarray()])

# Step 6: Prepare the target variable: encode the 'sentiment_category' labels
# as integers. The fitted encoder is kept to map predictions back to labels.
label_encoder = LabelEncoder()
df['sentiment_category_encoded'] = label_encoder.fit_transform(df['sentiment_category'])
y = df['sentiment_category_encoded']  # Target variable

In [None]:
# Step 7: Hold out 20% of the rows as a test set; the fixed seed keeps the
# split reproducible across runs.
split_parts = train_test_split(
    X_combined,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts


In [None]:
# Step 8: Fit a random forest (100 trees, seeded) on the training split
forest_params = {"n_estimators": 100, "random_state": 42}
model = RandomForestClassifier(**forest_params)
model.fit(X_train, y_train)

In [None]:
# Step 9: Evaluate the model on the held-out test split
y_pred = model.predict(X_test)

# Report overall accuracy and the per-class precision/recall/F1 breakdown
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# The model, vectorizer, and label encoder are persisted by the dedicated
# "Step 10" cell that follows. The verbatim copy of that save code that used to
# live here was removed so this cell only evaluates and the artifacts are
# written exactly once.

In [None]:
# Step 10: Persist the fitted model, TF-IDF vectorizer, and label encoder
# (optional). Each artifact is written to its own pickle file in the current
# working directory.
for artifact, filename in (
    (model, 'stock_sentiment_model.pkl'),
    (tfidf_vectorizer, 'tfidf_vectorizer.pkl'),
    (label_encoder, 'label_encoder.pkl'),
):
    joblib.dump(artifact, filename)

print("Model, vectorizer, and label encoder saved.")



In [None]:
# Optional sanity check: map the encoded test labels and predictions back to
# their original category names and show the first few rows side by side.
predictions = label_encoder.inverse_transform(y_pred)
actual_labels = label_encoder.inverse_transform(y_test)
df_predictions = pd.DataFrame(
    {
        'Actual': actual_labels,
        'Predicted': predictions,
    }
)
print(df_predictions.head())