## Sentiment analysis using logistic regression
1. Download and import dataset
2. Clean, rename columns, check data
3. Tokenize words
4. Train model
5. Check accuracy
6. Run AUC, F1
 
 
**Extras**
- Build Binary classifier (positive/negative)
- Interpret results
- Print words with the most positive and negative weights

In [None]:
# Download the dataset CSV; -nc (no-clobber) skips the download when the file already exists locally
!wget -nc https://lazyprogrammer.me/course_files/AirlineTweets.csv

In [None]:
# import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score, roc_auc_score, roc_curve, auc, confusion_matrix
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [None]:
threshold = 0.5 # tweets are kept only when airline_sentiment_confidence is strictly above this value

def process_string(s):
    """Return ``s`` with its first whitespace-separated token removed.

    The remaining tokens are re-joined with single spaces, so any run of
    whitespace in the input is collapsed to one space.

    Args:
        s: The text to process (a ``str``).

    Returns:
        The text without its first token. An empty string is returned when
        ``s`` is empty, whitespace-only, or consists of a single token.
    """
    # Slicing (unlike list.pop(0)) does not raise on empty / whitespace-only
    # input, so one blank row cannot abort a whole DataFrame.apply().
    return ' '.join(s.split()[1:])


# Load the raw tweets, keep the rows whose sentiment label has a confidence
# above `threshold`, and retain only the label and text columns.
# Row filter and column selection are done in one .loc call and followed by
# .copy(), so `df` is an independent frame: the column assignment below then
# does not write into a slice of another frame (no SettingWithCopyWarning).
df = pd.read_csv('AirlineTweets.csv', delimiter=",")
df = df.loc[
    df['airline_sentiment_confidence'] > threshold,
    ['airline_sentiment', 'text'],
].copy()

# Strip the first whitespace-separated token from every tweet
df['text'] = df['text'].apply(process_string)

sentiment = df["airline_sentiment"]
text = df["text"]

# Display the first few rows of the DataFrame
print(df.head())
print(f"Number of samples: {len(df)}")

## Convert classes to numbers
#target_map = {"pos":1,"neg":0,"neut":2}
#df['airline_sentiment'] = df['airline_sentiment'].map(target_map)


In [None]:
# TODO: create a function that splits the data but keeps the same percentage of each class in both train and test
# Check for class imbalance: histogram of how often each sentiment label occurs

test_plots = df['airline_sentiment'].hist()

In [None]:
# 70/30 train/test split with a fixed seed for reproducibility.
# stratify=sentiment keeps the same proportion of each class in both parts,
# so an under-represented class is not (by chance) even rarer in the test set.
X_train, X_test, y_train, y_test = train_test_split(
    text, sentiment, test_size=0.3, random_state=42, stratify=sentiment
)


tfidf_vectorizer = TfidfVectorizer(lowercase=True)
# Not used by the cells below; kept so the name stays defined.
count_vectorizer = CountVectorizer()

# .fit_transform() learns the vocabulary (and IDF weights) from the training
# text and vectorizes it; .transform() does no fitting and only vectorizes
# with what was already learned, so the test set never influences the features.
X_train_vect = tfidf_vectorizer.fit_transform(X_train)
X_test_vect = tfidf_vectorizer.transform(X_test)


In [None]:
# Fit a class-weighted logistic regression on the vectorized training tweets
model = LogisticRegression(class_weight='balanced', max_iter=1000)
model.fit(X_train_vect, y_train)

# Mean accuracy on each split
train_accuracy = model.score(X_train_vect, y_train)
test_accuracy = model.score(X_test_vect, y_test)
print("Model train acc:", train_accuracy)
print("Model test acc:", test_accuracy)

In [None]:
# Class probabilities for every test tweet; columns are ordered like model.classes_
preds = model.predict_proba(X_test_vect)

# Class display names, read from the fitted model so that they always match
# the column order of `preds` and the spelling of the dataset labels.
probs = [str(label) for label in model.classes_]

# tweet text -> probability of its predicted class, one dict per predicted class
# (the dict names are kept unchanged so other cells can still refer to them)
negative_sentiment = {}
neuteral_sentiment = {}
positive_sentiment = {}
sentiment_buckets = {
    "negative": negative_sentiment,
    "neutral": neuteral_sentiment,
    "positive": positive_sentiment,
}

winning_class = preds.argmax(axis=1)
winning_conf = preds.max(axis=1)

for tweet, class_idx, confidence in zip(X_test, winning_class, winning_conf):
    bucket = sentiment_buckets.get(probs[class_idx])
    if bucket is not None:
        # Identical tweet texts share one entry; the last occurrence wins
        bucket[tweet] = confidence


# Print the most confident positive tweet, then the most confident negative one
for bucket in (positive_sentiment, negative_sentiment):
    if not bucket:
        # No test tweet was predicted as this class; max() would raise on an empty dict
        continue
    max_key = max(bucket, key=bucket.get)
    print(max_key)
    print(bucket[max_key])



In [None]:
# Sample custom text for prediction
custom_text = ["it was mid", "it was awesome"]

# Transform the custom text using the same, already fitted vectorizer
custom_text_vectorized = tfidf_vectorizer.transform(custom_text)

# Make the prediction
predictions = model.predict(custom_text_vectorized)

# Display the predictions.
# The loop variable is deliberately not named `text`: that module-level name
# holds the tweet column used for the train/test split and would otherwise be
# replaced by the last sample string once this cell has run.
for sample, prediction in zip(custom_text, predictions):
    print(f"Text: {sample}\nPrediction: {prediction}\n")

In [None]:
# Multi-class AUC (one-vs-one) computed from the predicted class probabilities
P_train = model.predict_proba(X_train_vect)
P_test = model.predict_proba(X_test_vect)

train_auc = roc_auc_score(y_train, P_train, multi_class='ovo')
test_auc = roc_auc_score(y_test, P_test, multi_class='ovo')
print("Train AUC:", train_auc)
print("Test AUC:", test_auc)


In [None]:
# Confusion matrix on the training split, normalized per true class (rows sum to 1)
P_train = model.predict(X_train_vect)
P_test = model.predict(X_test_vect)

# Take the class order from the fitted model and hand the same list to both
# confusion_matrix and the heatmap, so the tick labels are guaranteed to match
# the matrix rows/columns instead of relying on a hand-written label list.
class_labels = list(model.classes_)
cm = confusion_matrix(y_train, P_train, labels=class_labels, normalize='true')

sns.heatmap(cm, annot=True, cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
# Name the split in the title so this plot can be told apart from the test one
plt.title('Confusion Matrix (train)')
plt.show()

In [None]:
# Confusion matrix on the test split, normalized per true class (rows sum to 1).
# P_test holds the predicted labels computed in the previous cell.
# The class order comes from the fitted model and is passed to both
# confusion_matrix and the heatmap so tick labels always match the matrix.
class_labels = list(model.classes_)
cm = confusion_matrix(y_test, P_test, labels=class_labels, normalize='true')

sns.heatmap(cm, annot=True, cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
# Name the split in the title so this plot can be told apart from the train one
plt.title('Confusion Matrix (test)')
plt.show()


### Build Binary classifier
Keep only negative and positive classes

In [None]:
# Drop the neutral tweets so only the two remaining classes are modelled
binary_list = df[df['airline_sentiment'] != "neutral"]

# NOTE: text, sentiment, the split variables, tfidf_vectorizer and model are
# re-bound here, so every cell below works on the binary problem.
text = binary_list['text']
sentiment = binary_list['airline_sentiment']

# stratify=sentiment keeps the same class proportions in train and test
X_train, X_test, y_train, y_test = train_test_split(
    text, sentiment, test_size=0.3, random_state=42, stratify=sentiment
)


# max_features caps the vocabulary at 2000 terms
tfidf_vectorizer = TfidfVectorizer(lowercase=True, max_features=2000)

# .fit_transform() learns the vocabulary (and IDF weights) from the training
# text and vectorizes it; .transform() does no fitting and only vectorizes
# with what was already learned.
X_train_vect = tfidf_vectorizer.fit_transform(X_train)
X_test_vect = tfidf_vectorizer.transform(X_test)


model = LogisticRegression(max_iter=1000)
model.fit(X_train_vect, y_train)
print("Model train acc:", model.score(X_train_vect, y_train))
print("Model test acc:", model.score(X_test_vect, y_test))

In [None]:
# Binary AUC, scored with the probability of the second class in model.classes_
train_proba = model.predict_proba(X_train_vect)
test_proba = model.predict_proba(X_test_vect)
P_train = train_proba[:, 1]
P_test = test_proba[:, 1]

train_auc = roc_auc_score(y_train, P_train)
test_auc = roc_auc_score(y_test, P_test)
print("Train AUC:", train_auc)
print("Test AUC:", test_auc)


In [None]:
# Confusion matrix of the binary model on the training split,
# normalized per true class (rows sum to 1)
P_train = model.predict(X_train_vect)
P_test = model.predict(X_test_vect)

# The binary model has two classes, so the three-entry label list used for the
# multi-class plots does not fit this 2x2 matrix. Read the labels from the
# fitted model and pass the same list to confusion_matrix and the heatmap.
class_labels = list(model.classes_)
cm = confusion_matrix(y_train, P_train, labels=class_labels, normalize='true')

sns.heatmap(cm, annot=True, cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix (binary, train)')
plt.show()

In [None]:
# model.coef_ holds the learned weights; plot a 30-bin histogram of its first row
plt.hist(model.coef_[0],bins=30)


In [None]:
# word -> feature (column) index dictionary taken from the fitted vectorizer
word_index_map = tfidf_vectorizer.vocabulary_
word_index_map

In [None]:
# Collect the weight of every vocabulary word and print the words whose
# weight exceeds the threshold
threshold = 2
print("most positive words")

word_weights = model.coef_[0]
all_wwords = list(word_index_map.keys())
all_weights = [word_weights[index] for index in word_index_map.values()]

for word, weight in zip(all_wwords, all_weights):
    if weight > threshold:
        print(word, weight)

In [None]:
# Table of every vocabulary word with its weight; show the 50 largest weights
all_df = pd.DataFrame({'words': all_wwords, 'weights': all_weights})
df_sorted = all_df.sort_values('weights', ascending=False)
df_sorted.head(50)


In [None]:
# Print the words whose weight is below -threshold, i.e. the strongly negative ones
threshold = 2
# The header previously said "positive" although this loop lists negative weights
print("most negative words")

for word, index in word_index_map.items():
    weight = model.coef_[0][index]
    if weight < -threshold:
        print(word, weight)

In [None]:
# Look up every vocabulary word containing the search string (case-insensitive)
search_string = 'del'
matches_search = all_df['words'].str.contains(search_string, case=False, na=False)
filtered_df = all_df[matches_search]
filtered_df

In [None]:
# Goal: find the most confidently wrong predictions for both classes.
# This cell gathers, per test tweet, the probability of the predicted class
# next to the true label and the tweet text.
# TODO: filter to the misclassified rows and sort by confidence.

Preds = model.predict_proba(X_test_vect)
pred_conf = np.amax(Preds, axis=1)

# Build the frame on y_test's index. y_test and X_test still carry the
# (shuffled) row labels of the original frame, and pandas aligns on labels
# when a Series is assigned as a column; with a fresh 0..n-1 index the labels
# and texts would be paired with the wrong confidences or turn into NaN.
get_wrongest_preds = pd.DataFrame(pred_conf, columns=['preds'], index=y_test.index)
get_wrongest_preds['Y'] = y_test
get_wrongest_preds['X_test'] = X_test
get_wrongest_preds

