In [4]:
import pandas as pd
import pymysql
import re
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

In [5]:
# Function to create a database connection using pymysql
def create_connection():
    return pymysql.connect(host='database-1.csopvl4k4p5e.us-east-1.rds.amazonaws.com',
                           user='admin',
                           password='llmtest123',
                           database='LLMProject',
                           port=3306,
                           charset='utf8mb4',
                           cursorclass=pymysql.cursors.DictCursor)

In [6]:
def clean_text(text):
    # Normalize text by encoding to ASCII and decoding back to string
    text = text.encode('ascii', 'ignore').decode('ascii')
    # Remove non-alphanumeric characters and unnecessary spaces
    text = re.sub(r'[^a-zA-Z0-9\s,.?!]', '', text)
    text = text.strip()
    return text

In [7]:
# Define a function to load and clean the dataset
def load_and_clean_data():
    connection = create_connection()
    df = pd.DataFrame()
    try:
        with connection.cursor() as cursor:
            sql = 'SELECT * FROM reddit_hn'  # Change to select all columns
            cursor.execute(sql)
            rows = cursor.fetchall()  # Fetch all the results
            df = pd.DataFrame(rows)  # Convert to DataFrame
    finally:
        connection.close()

    # Check if the DataFrame is not empty and the columns exist before cleaning
    if not df.empty and 'SubmissionTitle' in df.columns and 'Text' in df.columns:
        df['SubmissionTitle'] = df['SubmissionTitle'].fillna('').apply(clean_text)
        df['Text'] = df['Text'].fillna('').apply(clean_text)

        # Optionally, remove rows where 'SubmissionTitle' or 'Text' are empty strings after cleaning
        df = df[(df['SubmissionTitle'] != '') & (df['Text'] != '')]

    return df

In [8]:
# Load and clean the data
cleaned_df = load_and_clean_data()
print(cleaned_df.head())  # Display the first few rows of the cleaned dataframe
print(cleaned_df.shape[0])

          CreatedTime SubmissionID  \
0 2023-04-11 23:58:09      12j2q08   
1 2023-04-12 05:16:08      12jb0p7   
2 2023-04-12 05:21:23      12jb52v   
3 2023-04-12 09:51:42      12jgw3i   
4 2023-04-12 13:29:18      12jmb5e   

                                     SubmissionTitle  \
0  How come Chatgp3 is terrible at analyzing gram...   
1  Just Released an OpenSource Tool to Help Test ...   
2  Is OpenAIs Study On The Labor Market Impacts O...   
3  How do companies tackle observability, bias, a...   
4         Do LLMs retain information interlingually?   

                                                Text  \
0  So I have been playing around with GP3 alot re...   
1  I pushed out some simple code for running expe...   
2  Example imgnamehttpspreview.redd.itsqjd5aiu1et...   
3  Hey rGPT3 community!\n\nIve been diving into t...   
4  If an LLM like GPT4 is fed information in one ...   

                                       SubmissionURL  Score  NumberOfComments  \
0  /r/GPT3/comme

In [9]:
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")

# Initialize the Hugging Face sentiment-analysis pipeline with RoBERTa model
sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

    PyTorch 2.2.2+cu121 with CUDA 1201 (you have 2.2.2+cpu)
    Python  3.10.11 (you have 3.10.6)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details


In [10]:
# Function to handle long texts and classify sentiment
def get_sentiment(text):
    try:
        result = sentiment_pipeline(text[:512])  # Truncate text to 512 characters
        sentiment_label = result[0]['label']
        sentiment_score = result[0]['score']

        # Map model-specific output labels to generic labels
        if sentiment_label == 'LABEL_0':
            sentiment = 'NEGATIVE'
        elif sentiment_label == 'LABEL_1':
            sentiment = 'NEUTRAL'
        else: 
            sentiment = 'POSITIVE'
        
        return sentiment, sentiment_score
    except Exception as e:
        print(f"Error processing text: {e}")
        return "ERROR", 0

In [11]:
def save_to_database(df, original_table_name, new_table_name):
    connection = create_connection()
    try:
        with connection.cursor() as cursor:
            # Attempt to retrieve the structure of the original table
            try:
                cursor.execute(f"SHOW CREATE TABLE {original_table_name}")
                create_table_query = cursor.fetchone()['Create Table']
            except Exception as e:
                raise Exception(f"Error retrieving table structure: {e}")

            # Modify the table structure query to create a new table
            create_table_query = create_table_query.replace(original_table_name, new_table_name)
            pos = create_table_query.rfind(')')
            create_table_query = create_table_query[:pos] + ', Sentiment VARCHAR(10), Sentiment_Score FLOAT' + create_table_query[pos:]

            # Create the new table
            try:
                cursor.execute(f"DROP TABLE IF EXISTS {new_table_name}")
                cursor.execute(create_table_query)
            except Exception as e:
                raise Exception(f"Error creating new table: {e}")

            # Prepare and execute the insert statement
            fields = list(df.columns)
            placeholders = ', '.join(['%s'] * len(fields))
            insert_query = f"INSERT INTO {new_table_name} ({', '.join(fields)}) VALUES ({placeholders})"
#             try:
#                 for _, row in df.iterrows():
#                     values = tuple(row[field] if field in row else None for field in fields)
#                     cursor.execute(insert_query, values)
#                 connection.commit()
#             except Exception as e:
#                 connection.rollback()  # Rollback in case of error
#                 raise Exception(f"Error inserting data: {e}")

            # Prepare data for bulk insert
            data_to_insert = [tuple(row[field] for field in fields) for index, row in df.iterrows()]

            # Execute bulk insert
            cursor.executemany(insert_query, data_to_insert)
            
            connection.commit()
    except Exception as final_error:
        print(f"Database operation failed: {final_error}")
    finally:
        connection.close()


In [49]:
# Apply the sentiment analysis function to the Text column
cleaned_df[['Sentiment', 'Sentiment_Score']] = cleaned_df['Text'].apply(lambda x: pd.Series(get_sentiment(x)))

In [50]:
# Save the DataFrame with sentiment data to the new table
save_to_database(cleaned_df, 'reddit_hn', 'sentiment_analysis')

In [12]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Function to load annotated CSV
def load_annotated_data(file_path):S
    return pd.read_csv(file_path)

# Function to predict sentiment using your existing pipeline
def predict_sentiment(df, text_column):
    sentiments = df[text_column].apply(lambda x: get_sentiment(x)[0])
    return sentiments

# Function to evaluate the model
def evaluate_model(predictions, annotations):
    accuracy = accuracy_score(annotations, predictions)
    precision, recall, fscore, _ = precision_recall_fscore_support(annotations, predictions, average='weighted')
    return accuracy, precision, recall, fscore

# Load the annotated data
annotated_data = load_annotated_data("sentiment_reddit_data.csv")

# Predict the sentiment
predicted_sentiments = predict_sentiment(annotated_data, 'Text')

# Evaluate the model
accuracy, precision, recall, fscore = evaluate_model(predicted_sentiments, annotated_data['Annotated_Sentiment'])

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {fscore}")


Accuracy: 0.65
Precision: 0.6950000000000001
Recall: 0.65
F1-Score: 0.6573975044563279
