In [8]:
import pandas as pd

# Load the dataset (Update the file path)
df = pd.read_csv(r'G:\hackathons\llm\llm.csv', encoding='ISO-8859-1')


# Display first few rows to check column names
print(df.head())

# Check dataset info
print(df.info())


   0  1467810369  Mon Apr 06 22:19:45 PDT 2009  NO_QUERY _TheSpecialOne_  \
0  0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY   scotthamilton   
1  0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY        mattycus   
2  0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY         ElleCTF   
3  0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY          Karoli   
4  0  1467811372  Mon Apr 06 22:20:00 PDT 2009  NO_QUERY        joy_wolf   

  @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D  
0  is upset that he can't update his Facebook by ...                                                                   
1  @Kenichan I dived many times for the ball. Man...                                                                   
2    my whole body feels itchy and like its on fire                                                                    
3  @nationwideclass no, it's not behaving at all....           

In [9]:
print(df.columns)


Index(['0', '1467810369', 'Mon Apr 06 22:19:45 PDT 2009', 'NO_QUERY',
       '_TheSpecialOne_',
       '@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D'],
      dtype='object')


In [10]:
df.columns = ['sentiment', 'id', 'date', 'query', 'user', 'text']


In [11]:
df['text'].head()


0    is upset that he can't update his Facebook by ...
1    @Kenichan I dived many times for the ball. Man...
2      my whole body feels itchy and like its on fire 
3    @nationwideclass no, it's not behaving at all....
4                        @Kwesidei not the whole crew 
Name: text, dtype: object

In [12]:
df['cleaned_text'] = df['text'].apply(lambda x: x.lower())  # Converts text to lowercase


In [13]:
print(df['text'].isnull().sum())


0


In [14]:
df = df.dropna(subset=['text'])  # Remove rows with missing text
# OR
df['text'] = df['text'].fillna('')  # Replace NaN with an empty string


In [15]:
print(df['text'].dtype)


object


In [16]:
df['text'] = df['text'].astype(str)


In [17]:
df['cleaned_text'] = df['text'].str.lower().str.replace(r'[^a-zA-Z\s]', '', regex=True)


In [19]:
import re
import string
from tqdm import tqdm

tqdm.pandas()  # Enable tqdm progress bar for pandas

# Define the preprocessing function
def preprocess_text(text):
    if isinstance(text, str):  # Ensure input is a string
        text = text.lower()  # Convert to lowercase
        text = re.sub(r"http\S+", "", text)  # Remove URLs
        text = re.sub(r"@\w+", "", text)  # Remove mentions
        text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
        text = text.strip()  # Remove extra spaces
        return text
    return ""  # Return empty string if text is not valid

# Apply the function to the DataFrame
df["cleaned_text"] = df["text"].progress_apply(preprocess_text)


100%|██████████| 1599999/1599999 [00:08<00:00, 197252.93it/s]


In [20]:
import swifter
df["cleaned_text"] = df["cleaned_text"].swifter.apply(preprocess_text)


Pandas Apply:   0%|          | 0/1599999 [00:00<?, ?it/s]

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)  # Limit features for speed
X = vectorizer.fit_transform(df["cleaned_text"])


In [22]:
print(df.columns)


Index(['sentiment', 'id', 'date', 'query', 'user', 'text', 'cleaned_text'], dtype='object')


In [25]:
from sklearn.model_selection import train_test_split

# Ensure X is defined before using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, df["sentiment"], test_size=0.2, random_state=42)


In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Initialize the Logistic Regression model
model = LogisticRegression(max_iter=1000)

# Train the model on the training data
model.fit(X_train, y_train)


In [27]:
# Predict the labels on the test set
y_pred = model.predict(X_test)


In [28]:
# Print accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))

# Print detailed classification report
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.790678125
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.78      0.79    159494
           4       0.78      0.80      0.79    160506

    accuracy                           0.79    320000
   macro avg       0.79      0.79      0.79    320000
weighted avg       0.79      0.79      0.79    320000



In [29]:
import pickle

# Save the trained Logistic Regression model using pickle
with open('logistic_regression_model.pkl', 'wb') as f:
    pickle.dump(model, f)

# Save the vectorizer (used for text transformation)
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)


In [31]:
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Sample data - Replace this with your actual data
X = ["I feel great!", "I am so sad", "This is amazing", "I am angry", "I feel hopeful"]
y = ["joy", "sadness", "joy", "anger", "hope"]

# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and fit vectorizer
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)

# Train the model
model = LogisticRegression()
model.fit(X_train_vec, y_train)

# Save the model and vectorizer as pickle files
with open("emotion_classification_model.pkl", "wb") as f:
    pickle.dump(model, f)

with open("vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)


In [32]:
import os
print(os.getcwd())  # This will show the current working directory.


g:\hackathons\llm


In [33]:
import pandas as pd

# Load the dataset with the correct encoding
data = pd.read_csv(r'..\llm\llm.csv', encoding='ISO-8859-1')

# Check the column names and inspect the first few rows of the dataset
print("Columns in the dataset:", data.columns)
print(data.head())

# Try to identify the correct column name for text (it might be something like 'cleaned_text' or others)


Columns in the dataset: Index(['0', '1467810369', 'Mon Apr 06 22:19:45 PDT 2009', 'NO_QUERY',
       '_TheSpecialOne_',
       '@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D'],
      dtype='object')
   0  1467810369  Mon Apr 06 22:19:45 PDT 2009  NO_QUERY _TheSpecialOne_  \
0  0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY   scotthamilton   
1  0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY        mattycus   
2  0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY         ElleCTF   
3  0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY          Karoli   
4  0  1467811372  Mon Apr 06 22:20:00 PDT 2009  NO_QUERY        joy_wolf   

  @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D  
0  is upset that he can't update his Facebook by ...                                                                   
1  @Kenichan I dived many times fo

In [34]:
import pandas as pd

# Load the dataset without headers and specify column names
column_names = ['id', 'timestamp', 'date', 'query', 'user', 'text', 'sentiment']  # Based on the structure of the dataset

data = pd.read_csv(r'..\llm\llm.csv', encoding='ISO-8859-1', header=None, names=column_names)

# Check the first few rows and verify the columns
print("Columns in the dataset:", data.columns)
print(data.head())

# Now, the correct columns should be identified and you can proceed
X = data['text']  # This is the tweet text column
y = data['sentiment']  # Assuming sentiment is the target column


Columns in the dataset: Index(['id', 'timestamp', 'date', 'query', 'user', 'text', 'sentiment'], dtype='object')
   id   timestamp                          date     query             user  \
0   0  1467810369  Mon Apr 06 22:19:45 PDT 2009  NO_QUERY  _TheSpecialOne_   
1   0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY    scotthamilton   
2   0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY         mattycus   
3   0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY          ElleCTF   
4   0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY           Karoli   

                                                text  sentiment  
0  @switchfoot http://twitpic.com/2y1zl - Awww, t...        NaN  
1  is upset that he can't update his Facebook by ...        NaN  
2  @Kenichan I dived many times for the ball. Man...        NaN  
3    my whole body feels itchy and like its on fire         NaN  
4  @nationwideclass no, it's not behaving at all....        NaN  


In [36]:
import pickle

with open('sentiment_model.pkl', 'wb') as model_file:
    pickle.dump(loaded_model, model_file)

with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(loaded_vectorizer, vectorizer_file)


In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the vectorizer
vectorizer = TfidfVectorizer()

# Fit the vectorizer on your dataset (assuming your DataFrame is 'df')
vectorizer.fit(df['text'])

# Now you can transform new text or continue with your processing
text_vectorized = vectorizer.transform(sample_text)


In [47]:
vectorizer.fit(df['text'])  # Replace 'df' with the correct variable name containing your dataset


In [48]:
# Transform the training text data into numerical features
X_train = vectorizer.transform(df['text'])


In [52]:
df_test = pd.read_csv("G:\\hackathons\\llm\\llm.csv", encoding='latin1')


In [54]:
print(df_test.columns)


Index(['0', '1467810369', 'Mon Apr 06 22:19:45 PDT 2009', 'NO_QUERY',
       '_TheSpecialOne_',
       '@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D'],
      dtype='object')


In [56]:
vectorizer.fit(df['cleaned_text'])  # or df['text'], depending on your use case


In [57]:
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)


In [58]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import pickle

# Initialize the TF-IDF vectorizer with the desired feature count
vectorizer = TfidfVectorizer(max_features=5000)

# Fit the vectorizer on the training data
X_train_vec = vectorizer.fit_transform(df['cleaned_text'])

# Train the Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vec, df['sentiment'])

# Save the model and vectorizer
with open('sentiment_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)


In [59]:
vectorizer = TfidfVectorizer(max_features=8)  # Limiting the features to match model expectations


In [69]:
import joblib  # If you saved the model
import numpy as np

# Load your trained model (if not already in memory)
model = joblib.load("sentiment_model.pkl")  # Change to your model's filename

# Load vectorizer (if saved separately)
vectorizer = joblib.load("vectorizer.pkl")  # Change as needed

# Example user input
user_input = "I'm not sure how I feel about this."

# Preprocess: Convert text to vectorized form
user_input_vectorized = vectorizer.transform([user_input])  # Ensure it's a list

# Get probability predictions
probs = model.predict_proba(user_input_vectorized)

# Determine sentiment based on the threshold
if np.max(probs[0]) < 0.6:  # Adjust threshold as needed
    sentiment = "Neutral"
else:
    sentiment = model.predict(user_input_vectorized)[0]  # Get predicted sentiment

# Output result
print("Predicted Sentiment:", sentiment)


Predicted Sentiment: 0


In [70]:
print("Predicted Probabilities:", probs)
print("Max Probability:", np.max(probs[0]))


Predicted Probabilities: [[0.79845062 0.20154938]]
Max Probability: 0.7984506248162111


In [71]:
print("Model Classes:", model.classes_)


Model Classes: [0 4]


In [72]:
if np.max(probs[0]) < 0.6:  
    sentiment = "Neutral"
else:
    sentiment = "Positive" if model.predict(user_input_vectorized)[0] == 4 else "Negative"

print("Predicted Sentiment:", sentiment)


Predicted Sentiment: Negative


In [73]:
import numpy as np

probs = model.predict_proba(user_input_vectorized)
max_prob = np.max(probs[0])  # Get highest probability score

if max_prob < 0.6:  
    sentiment = "Neutral"
else:
    predicted_label = model.predict(user_input_vectorized)[0]
    sentiment = "Positive" if predicted_label == 4 else "Negative"

print("Predicted Sentiment:", sentiment)


Predicted Sentiment: Negative
