## Sentiment analysis Using linear regression
1. Download and import dataset
2. Clean, rename columns, check data
3. Tokenize words
4. Train model
5. Check accuracy
6. Run AUC, F1
 
 
**Extras**
- Build Binary classifier
- Interpet results
- Print words with the most positive and negative weights

In [None]:
!wget -nc https://lazyprogrammer.me/course_files/AirlineTweets.csv

In [None]:
# import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score, roc_auc_score, roc_curve, auc, confusion_matrix
import matplotlib.pyplot as plt
import numpy as np


In [None]:
threshold = 0.5 # confidence in sentiment threshold 

def process_string(s):
    # Split the string on spaces
    words = s.split()
    
    # Remove one element (e.g., the second element, index 1)

    words.pop(0)
    
    # Rejoin the list into a string
    return ' '.join(words)


df = pd.read_csv('AirlineTweets.csv',delimiter=",")
df = df[df['airline_sentiment_confidence']>threshold]
df = df[['airline_sentiment','text']]

# Apply the function to each row in the DataFrame
df['text'] = df['text'].apply(process_string)

sentiment = df["airline_sentiment"]
text = df["text"]

# Display the first few rows of the DataFrame
df.head()
print(f"Number of samples: {len(df)}")

In [None]:
#TODO: create a function that splits the data but keeps the same percentage of each class in both train and test
# Check for class imbalance

test_plots = df['airline_sentiment'].value_counts()
test_plots.plot(kind='bar', color='skyblue', alpha=0.7)
plt.show()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(text, sentiment, test_size=0.3, random_state=42)


tfidf_vectorizer = TfidfVectorizer()
count_vectorizer = CountVectorizer()

# function .fit_transform() trains on a new vocabulary, 
# while .transform() trains on existing vocabulary
X_train_vect = tfidf_vectorizer.fit_transform(X_train)
X_test_vect = tfidf_vectorizer.transform(X_test)


In [None]:
# Train a logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vect, y_train)

In [36]:
preds = model.predict_proba(X_test_vect)
probs = ["negative","neuteral","positive"]

negative_sentiment = {}
neuteral_sentiment = {}
positive_sentiment = {}

#accuracy = accuracy_score(y_test,preds)
#print(f"model accuracy:{accuracy}")

for i,pred in enumerate(preds):
  
    index = np.argmax(pred)
    
    if index == 0:
        negative_sentiment[X_test.iloc[i]] = pred[index]
    
    if index == 1:
        neuteral_sentiment[X_test.iloc[i]] = pred[index]
    
    if index == 2:
        positive_sentiment[X_test.iloc[i]] = pred[index]
    
        
print(f"pos: {len(positive_sentiment)} , neg: {len(negative_sentiment)}, neut: {len(neuteral_sentiment)}")

pos: 460 , neg: 3186, neut: 636


In [40]:
# get most positive review
max_key = max(positive_sentiment, key=positive_sentiment.get)
print(max_key)


Great - thank you.


In [44]:
# Sample custom text for prediction
custom_text = ["it sucked so bad", "it was awesome"]

# Transform the custom text using the same vectorizer
custom_text_vectorized = tfidf_vectorizer.transform(custom_text)

# Make the prediction
predictions = model.predict(custom_text_vectorized)

# Display the predictions
for text, prediction in zip(custom_text, predictions):
    print(f"Text: {text}\nPrediction: {prediction}\n")

Text: it sucked so bad
Prediction: negative

Text: it was awesome
Prediction: positive

