In [1]:
import speech_recognition as sr
import pandas as pd
import os
import numpy as np
import re
import nltk
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
# Build the English stop-word set used by the text preprocessor.
# The corpus is fetched on demand so this cell also works on a machine
# where the NLTK stop-word data has not been installed yet.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay


In [3]:
# Load the tweet dataset from 'tweet.csv' (path is relative to the working directory).
df = pd.read_csv('tweet.csv')

In [4]:
# Count missing values in each column of the freshly loaded frame.
df.isnull().sum()

textID              0
text                1
selected_text       1
sentiment           0
Time of Tweet       0
Age of User         0
Country             0
Population -2020    0
Land Area (Km²)     0
Density (P/Km²)     0
dtype: int64

In [5]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26


In [6]:
# Discard rows whose 'text' value is missing; `df` is rebound to the filtered frame.
df = df.dropna(subset=['text'])

In [7]:
# Re-count missing values per column after dropping rows with no text.
df.isnull().sum()

textID              0
text                0
selected_text       0
sentiment           0
Time of Tweet       0
Age of User         0
Country             0
Population -2020    0
Land Area (Km²)     0
Density (P/Km²)     0
dtype: int64

In [8]:

# Keep only the tweet text and its sentiment label.
# .copy() makes this an independent frame, so the column assignments made in
# later cells do not act on a slice of the original frame (which pandas
# reports with SettingWithCopyWarning).
df = df[['text', 'sentiment']].copy()
df.head()

Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative


In [9]:

# String sentiment label -> integer class used by the classifiers.
sentiment_mapping = {
    'positive': 1,
    'negative': -1,
    'neutral': 0
}

# Encode the labels. Values that are not keys of the mapping are passed through
# unchanged, so re-running this cell on already-encoded integers is a no-op
# (a plain .map(sentiment_mapping) would turn them all into NaN).
df['sentiment'] = df['sentiment'].map(
    lambda label: sentiment_mapping.get(label, label)
)


In [10]:
# Display the frame now that 'sentiment' holds the integer codes.
df

Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",0
1,Sooo SAD I will miss you here in San Diego!!!,-1
2,my boss is bullying me...,-1
3,what interview! leave me alone,-1
4,"Sons of ****, why couldn`t they put them on t...",-1
...,...,...
27476,wish we could come see u on Denver husband l...,-1
27477,I`ve wondered about rake to. The client has ...,-1
27478,Yay good for both of you. Enjoy the break - y...,1
27479,But it was worth it ****.,1


In [11]:
# Fetch the NLTK data the preprocessing step relies on: the stop-word corpus
# and the Punkt tokenizer models that word_tokenize loads. Newer NLTK releases
# look for 'punkt_tab' instead of 'punkt', so both are requested.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# Stemmer shared by the text preprocessor.
ps = PorterStemmer()
# English stop words as a list (the preprocessor itself uses the `stop_words` set).
all_stopwords = stopwords.words('english')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nikit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
def preprocess_text(text):
    """Normalise one tweet into a space-joined string of stemmed tokens.

    Missing values yield ''. Otherwise the text is lower-cased, every
    character other than a-z and whitespace is removed, the remainder is
    tokenised, stop words are discarded and each remaining token is stemmed.
    """
    if pd.isnull(text):
        return ''
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    stems = []
    for token in word_tokenize(letters_only):
        if token in stop_words:
            continue
        stems.append(ps.stem(token))
    return ' '.join(stems)
df['preprocessed_text'] = df['text'].apply(preprocess_text)


In [13]:
# Number of rows per encoded sentiment class.
df['sentiment'].value_counts()

sentiment
 0    11117
 1     8582
-1     7781
Name: count, dtype: int64

In [14]:
# Inspect the cleaned text column produced by preprocess_text.
df['preprocessed_text'] 

0                                            id respond go
1                                  sooo sad miss san diego
2                                               boss bulli
3                                      interview leav alon
4                    son couldnt put releas alreadi bought
                               ...                        
27476    wish could come see u denver husband lost job ...
27477    ive wonder rake client made clear net dont for...
27478    yay good enjoy break probabl need hectic weeke...
27479                                                worth
27480                           flirt go atg smile yay hug
Name: preprocessed_text, Length: 27480, dtype: object

In [15]:
# Preview the original text next to its preprocessed form.
df.head()

Unnamed: 0,text,sentiment,preprocessed_text
0,"I`d have responded, if I were going",0,id respond go
1,Sooo SAD I will miss you here in San Diego!!!,-1,sooo sad miss san diego
2,my boss is bullying me...,-1,boss bulli
3,what interview! leave me alone,-1,interview leav alon
4,"Sons of ****, why couldn`t they put them on t...",-1,son couldnt put releas alreadi bought


In [16]:
# Features are the cleaned tweets; the target is the integer sentiment code.
X, y = df['preprocessed_text'], df['sentiment']

In [17]:
# train_test_split is already imported in the first cell, so it is not
# re-imported here.
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_train

24567    cant stop playin head pussycat doll jai ho ell...
24619                                                 hate
19766                                    starbuck im lovin
21737                                     ben jerryyummmmi
8980                     wow purpl leopard skin fieeerrcee
                               ...                        
13124    thank shout outyou might right starv thing lit...
19649                             im go look like sun gone
9846     soooo say phrase stop fals thank hateryoda per...
10800                                   morn love day last
2733     editor read nod writer read take note httptiny...
Name: preprocessed_text, Length: 21984, dtype: object

In [18]:
# Build TF-IDF features. The vectorizer is fitted on the training texts only;
# the test texts are transformed with that already-fitted vectorizer.
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [19]:
# Baseline Random Forest on the TF-IDF features.
# A fixed seed makes the fitted forest, and therefore the printed accuracy,
# reproducible across kernel restarts.
model_rf = RandomForestClassifier(random_state=0)
model_rf.fit(X_train_vec, y_train)
preds = model_rf.predict(X_test_vec)
print(accuracy_score(y_test, preds))

0.6957787481804949


In [22]:
# Single recogniser instance reused for every transcription.
recognizer = sr.Recognizer()
def audio_to_text(audio_path):
    """Return the transcript of the audio file at `audio_path`.

    The whole file is read into memory and then passed to
    `recognizer.recognize_google`; any exception raised while opening the
    file or recognising the audio propagates to the caller.
    """
    with sr.AudioFile(audio_path) as audio_source:
        recording = recognizer.record(audio_source)
    return recognizer.recognize_google(recording)

audio_text = audio_to_text('happy2.wav')

def preprocess_text(text):
    """Normalise a transcript exactly the way the training tweets were normalised.

    The TF-IDF vocabulary is learned from text that was lower-cased, stripped
    of everything except a-z and whitespace, tokenised, stop-word filtered and
    stemmed. Text scored by the model has to go through the same steps,
    otherwise unstemmed words simply miss the vocabulary.

    Returns '' for a missing value, else the stemmed tokens joined by spaces.
    """
    if pd.isnull(text):
        return ''
    text = re.sub(r'[^a-z\s]', '', text.lower())
    return ' '.join(
        ps.stem(word) for word in word_tokenize(text) if word not in stop_words
    )

# Normalise the transcript before it is vectorised.
preprocessed_text = preprocess_text(audio_text)

print("Transcript:")
print(preprocessed_text)

# Vectorize the text data using TfidfVectorizer (vocabulary capped at 5000 terms).
# Fitted on the training split only; the test split is transformed with it.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Fixed seed so that the fitted forest, its predictions and the reported
# accuracy are reproducible across runs.
model_rf = RandomForestClassifier(random_state=0)
model_rf.fit(X_train_tfidf, y_train)

# Vectorizing the transcript with the same fitted vectorizer
review_vector = tfidf_vectorizer.transform([preprocessed_text])

sentiment_prediction_rf = model_rf.predict(review_vector)

# predict() returns a one-element array for the single transcript; compare its
# scalar value rather than relying on the truthiness of an array comparison.
predicted_class = sentiment_prediction_rf[0]
if predicted_class == 1:
    print('Random Forest: Positive')
elif predicted_class == 0:
    print('Random Forest: Neutral')
else:
    print('Random Forest: Negative')

# Accuracy of the refitted model on the held-out split.
predictions_rf = model_rf.predict(X_test_tfidf)

accuracy_rf = accuracy_score(y_test, predictions_rf)

print(f'Random Forest Accuracy: {accuracy_rf:.2f}')


FileNotFoundError: [Errno 2] No such file or directory: 'happy2.wav'

In [None]:
# Audio clips to transcribe and score (paths relative to the working directory).
audio_files = [
    'angry6.wav',
    'angry5.wav',
    'happy5(disgust).wav',
    'neutral5(sad).wav',
    'neutral4.wav',
    'happy4(neutraldisgustsad).wav',
    'negative(neutral).wav',
    'angry3.wav',
    'silences(happy).wav',
    'happy3.wav',
    'angry4.wav',
    'neutral3(sad).wav',
    'happy2.wav',
    'neutral2(sad).wav',
    'happy.wav'
]

# Predicted class -> label written to the report; any class other than 1 or 0
# is reported as 'Negative'.
sentiment_labels = {1: 'Positive', 0: 'Neutral'}

# Store results in a list of dictionaries
results = []

# Load and preprocess the audio transcripts
for audio_file in audio_files:
    try:
        audio_text = audio_to_text(audio_file)
    except sr.UnknownValueError:
        # The recogniser found no intelligible speech. Keep a row for the clip
        # and carry on, instead of aborting the batch and losing every result
        # collected so far.
        print(f'No speech recognised in {audio_file}')
        results.append({
            'Audio File': audio_file,
            'Transcript': '',
            'Preprocessed Text': '',
            'Sentiment': 'Unrecognized'
        })
        continue

    preprocessed_text = preprocess_text(audio_text)

    # Vectorize the text data using the already-fitted TfidfVectorizer
    review_vector = tfidf_vectorizer.transform([preprocessed_text])

    # Predict sentiment using the Random Forest model
    sentiment_prediction_rf = model_rf.predict(review_vector)

    # predict() returns a one-element array; look its scalar value up in the table.
    sentiment_label = sentiment_labels.get(int(sentiment_prediction_rf[0]), 'Negative')

    # Store results in a dictionary
    result_entry = {
        'Audio File': audio_file,
        'Transcript': audio_text,
        'Preprocessed Text': preprocessed_text,
        'Sentiment': sentiment_label
    }

    # Append the result entry to the results list
    results.append(result_entry)

# Convert results to a DataFrame
results_df = pd.DataFrame(results)

# Save the results to a CSV file
results_df.to_csv('audio_sentiment_results.csv', index=False)

In [None]:
import pickle

# Persist the fitted vectorizer and model. The context managers make sure each
# file is flushed and closed as soon as it has been written.
with open('vectorizer1.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)
with open('model1.pkl', 'wb') as model_file:
    pickle.dump(model_rf, model_file)