# Sentiment Analysis
## Install the required packages

In [1]:
! pip install pandas statsmodels numpy scikit-learn scipy seaborn



In [2]:
# Let's start by importing the necessary libraries
import pandas as pd
import numpy as np
import statsmodels.api as sm
from scipy import stats
from sklearn.preprocessing import StandardScaler

## Calculate Metrics Per Transcript

In [12]:
import one_off

# Rebuilding the per-transcript metrics is slow (approximately 2:30 per item),
# so the pipeline call below is left disabled. To recompute, uncomment:
# all_transcripts_df = one_off.main()
# all_transcripts_df

# Load the previously saved metrics instead of recomputing them.
all_transcripts_df = pd.read_csv(filepath_or_buffer='transcript_dfs/all_transcripts_df.csv')

# Preview the first five rows.
all_transcripts_df.head(n=5)

Unnamed: 0,id,title,ae_name,ae_email,sales_outcome,date,meeting_attendees,host_email,transcript_url,video_url,...,sadness,surprise,sentiment_balance_ratio,ae_sentiment,client_sentiment,ae_sentiment_variability,client_sentiment_variability,ae_sentiment_trend,client_sentiment_trend,emotional_reciprocity
0,noufJNYz4sHSwwYc,Mohammad Nadeem Karim <> My Amazon Guy,Shawn Henderson,shawn.henderson@myamazonguy.com,closed_won,2023/07/24,"audray.alcordo@myamazonguy.com, shawn.henderso...",audray.alcordo@myamazonguy.com,https://app.fireflies.ai/view/noufJNYz4sHSwwYc,https://cdn.fireflies.ai/noufJNYz4sHSwwYc/vide...,...,0.024185,0.075664,0.871395,0.02477,0.007904,0.338995,0.394618,-0.000262,0.000149,0.160776
1,iPxunfPaFxy32jXq,Chris Meet with My Amazon Guy,Shawn Henderson,shawn.henderson@myamazonguy.com,closed_won,2023/04/13,"heidelmanc@gmail.com, shawn.henderson@myamazon...",shawn.henderson@myamazonguy.com,https://app.fireflies.ai/view/iPxunfPaFxy32jXq,https://cdn.fireflies.ai/iPxunfPaFxy32jXq/vide...,...,0.044437,0.069202,0.480661,0.011196,0.0,0.404633,0.0,0.000192,0.0,0.0
2,WJ436UP28tj1eR4p,Chris Meet with My Amazon Guy,Shawn Henderson,shawn.henderson@myamazonguy.com,closed_won,2023/05/12,"heidelmanc@gmail.com, shawn.henderson@myamazon...",shawn.henderson@myamazonguy.com,https://app.fireflies.ai/view/WJ436UP28tj1eR4p,https://cdn.fireflies.ai/WJ436UP28tj1eR4p/vide...,...,0.033103,0.077283,0.612445,0.044442,0.001328,0.382006,0.444766,5.8e-05,8.9e-05,0.091747
3,trQWJhhuamfX6Yxz,Chris Meet with My Amazon Guy,Shawn Henderson,shawn.henderson@myamazonguy.com,closed_won,2023/03/16,"chris@mymopshop.com, shawn.henderson@myamazong...",shawn.henderson@myamazonguy.com,https://app.fireflies.ai/view/trQWJhhuamfX6Yxz,https://cdn.fireflies.ai/trQWJhhuamfX6Yxz/vide...,...,0.030286,0.081729,0.509669,0.044274,-0.027803,0.350769,0.468817,0.0003,0.000533,0.10765
4,6SQGaUHHiwdrAvm1,kevin <> My Amazon Guy,John Aspinall,john.aspinall@myamazonguy.com,closed_won,2023/08/21,"sddrkevin@yahoo.com, john.aspinall@myamazonguy...",john.aspinall@myamazonguy.com,https://app.fireflies.ai/view/6SQGaUHHiwdrAvm1,https://cdn.fireflies.ai/6SQGaUHHiwdrAvm1/vide...,...,0.048736,0.082774,0.161487,-0.00469,-0.009098,0.346661,0.455024,-0.000228,0.001094,0.270022


## Clean Data
I have cleaned the data by removing rows whose `ae_talk_ratio` is exactly 0.0 or 1.0. A ratio at either extreme is not useful for the analysis, because it does not provide any information about a conversation between two people.

In [13]:
# Persist the current frame back to the CSV file (the row index is not written).
all_transcripts_df.to_csv(path_or_buf='transcript_dfs/all_transcripts_df.csv', index=False)

# Keep only the rows whose ae_talk_ratio is neither exactly 1.0 nor exactly 0.0.
test_df = all_transcripts_df.loc[
    lambda frame: frame['ae_talk_ratio'].ne(1.0) & frame['ae_talk_ratio'].ne(0.0)
]

test_df.head(n=5)

Unnamed: 0,id,title,ae_name,ae_email,sales_outcome,date,meeting_attendees,host_email,transcript_url,video_url,...,sadness,surprise,sentiment_balance_ratio,ae_sentiment,client_sentiment,ae_sentiment_variability,client_sentiment_variability,ae_sentiment_trend,client_sentiment_trend,emotional_reciprocity
0,noufJNYz4sHSwwYc,Mohammad Nadeem Karim <> My Amazon Guy,Shawn Henderson,shawn.henderson@myamazonguy.com,closed_won,2023/07/24,"audray.alcordo@myamazonguy.com, shawn.henderso...",audray.alcordo@myamazonguy.com,https://app.fireflies.ai/view/noufJNYz4sHSwwYc,https://cdn.fireflies.ai/noufJNYz4sHSwwYc/vide...,...,0.024185,0.075664,0.871395,0.02477,0.007904,0.338995,0.394618,-0.000262,0.000149,0.160776
2,WJ436UP28tj1eR4p,Chris Meet with My Amazon Guy,Shawn Henderson,shawn.henderson@myamazonguy.com,closed_won,2023/05/12,"heidelmanc@gmail.com, shawn.henderson@myamazon...",shawn.henderson@myamazonguy.com,https://app.fireflies.ai/view/WJ436UP28tj1eR4p,https://cdn.fireflies.ai/WJ436UP28tj1eR4p/vide...,...,0.033103,0.077283,0.612445,0.044442,0.001328,0.382006,0.444766,5.8e-05,8.9e-05,0.091747
3,trQWJhhuamfX6Yxz,Chris Meet with My Amazon Guy,Shawn Henderson,shawn.henderson@myamazonguy.com,closed_won,2023/03/16,"chris@mymopshop.com, shawn.henderson@myamazong...",shawn.henderson@myamazonguy.com,https://app.fireflies.ai/view/trQWJhhuamfX6Yxz,https://cdn.fireflies.ai/trQWJhhuamfX6Yxz/vide...,...,0.030286,0.081729,0.509669,0.044274,-0.027803,0.350769,0.468817,0.0003,0.000533,0.10765
4,6SQGaUHHiwdrAvm1,kevin <> My Amazon Guy,John Aspinall,john.aspinall@myamazonguy.com,closed_won,2023/08/21,"sddrkevin@yahoo.com, john.aspinall@myamazonguy...",john.aspinall@myamazonguy.com,https://app.fireflies.ai/view/6SQGaUHHiwdrAvm1,https://cdn.fireflies.ai/6SQGaUHHiwdrAvm1/vide...,...,0.048736,0.082774,0.161487,-0.00469,-0.009098,0.346661,0.455024,-0.000228,0.001094,0.270022
5,70F1yZvLPwgEn2i0,Ongaro Beauty <> My Amazon Guy,Shawn Henderson,shawn.henderson@myamazonguy.com,closed_won,2023/07/06,"airine.francisco@myamazonguy.com, brian@ongaro...",airine.francisco@myamazonguy.com,https://app.fireflies.ai/view/70F1yZvLPwgEn2i0,https://cdn.fireflies.ai/70F1yZvLPwgEn2i0/vide...,...,0.019127,0.084997,0.998057,0.04848,0.02226,0.388969,0.479764,3.6e-05,0.000313,0.400392


## Statistical Testing

## Correlation Analysis

In [5]:
# NOTE(review): this entire cell is disabled (commented out).
# It reads `talk_listen_df`, which is not created by any of the cells above
# (those define `all_transcripts_df` and `test_df`) — confirm the intended
# source frame before re-enabling.
# When enabled, it maps `sales_outcome` to a 0/1 column and computes a Pearson
# correlation (with p-value) between each listed variable and that column.
# It defines `variables` and `corr_df`, which later disabled cells refer to.
# talk_listen_df['sales_outcome_binary'] = talk_listen_df['sales_outcome'].map({'closed_won': 1, 'closed_lost': 0})

# # Select variables for analysis
# variables = [
#     'ae_talk_ratio', 'total_duration',
#     'ae_talk_duration', 'client_talk_duration'
# ]

# # Initialize lists to store results
# corr_list = []

# # Calculate correlation coefficients and p-values
# for var in variables:
#     corr_coef, p_value = stats.pearsonr(talk_listen_df[var], talk_listen_df['sales_outcome_binary'])
#     corr_list.append({
#         'Variable': var,
#         'Correlation with Sales Outcome': corr_coef,
#         'P-value (Correlation)': p_value
#     })

# # Create a DataFrame from the list
# corr_df = pd.DataFrame(corr_list)


## Logistic Regression

In [6]:
# NOTE(review): this entire cell is disabled (commented out).
# It depends on `variables` and `talk_listen_df['sales_outcome_binary']`, both
# of which are only assigned in the (also disabled) Correlation Analysis cell.
# When enabled, it standardizes the predictors, adds an intercept column, fits a
# statsmodels Logit model, and keeps the coefficient table in
# `regression_summary`; `X_scaled` and `regression_summary` are used by later cells.
# # Logistic Regression
# # Define predictors and target variable
# X = talk_listen_df[variables]
# y = talk_listen_df['sales_outcome_binary']

# # Standardize the predictors
# scaler = StandardScaler()
# X_scaled_array = scaler.fit_transform(X)
# X_scaled = pd.DataFrame(X_scaled_array, columns=X.columns, index=X.index)

# # Add constant term for intercept
# X_scaled = sm.add_constant(X_scaled)

# # Align X_scaled and y to ensure indices match
# X_scaled, y = X_scaled.align(y, join='inner', axis=0)

# # Fit the logistic regression model
# logit_model = sm.Logit(y, X_scaled)
# result = logit_model.fit(method='newton', maxiter=100)

# # Get the summary of regression results
# regression_summary = result.summary2().tables[1]
# print(result.summary2())



In [7]:
# NOTE(review): this entire cell is disabled (commented out).
# It requires `X_scaled` from the (also disabled) Logistic Regression cell.
# That frame has already been passed through `sm.add_constant`, so the loop over
# every column would report a VIF row for the intercept column as well as for
# the predictors — confirm whether that is intended before re-enabling.
# # Calculate VIF for each variable
# from statsmodels.stats.outliers_influence import variance_inflation_factor

# # Assuming X_scaled is your predictors DataFrame
# vif_data = pd.DataFrame()
# vif_data['Variable'] = X_scaled.columns
# vif_data['VIF'] = [variance_inflation_factor(X_scaled.values, i) for i in range(X_scaled.shape[1])]

# print(vif_data)


In [8]:
# NOTE(review): this entire cell is disabled (commented out).
# It would draw an annotated heatmap of the pairwise correlations of `X_scaled`,
# which is only assigned in the (also disabled) Logistic Regression cell.
# matplotlib is not listed in the install cell at the top of the notebook;
# presumably it arrives as a seaborn dependency — TODO confirm.
# import seaborn as sns
# import matplotlib.pyplot as plt

# corr_matrix = X_scaled.corr()
# plt.figure(figsize=(12, 10))
# sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm')
# plt.show()


## Regression Coefficients and P-values

In [9]:
# NOTE(review): this entire cell is disabled (commented out).
# It requires `regression_summary` from the (also disabled) Logistic Regression
# cell. When enabled, it turns the table's row labels into a 'Variable' column,
# renames the coefficient and p-value columns, and drops the 'const' row.
# The rename assumes reset_index() produces a column literally named 'index'
# (i.e. the summary table's index is unnamed) — TODO confirm.
# # Reset index to turn 'Variable' into a column
# regression_df = regression_summary.reset_index()

# # Rename columns for clarity
# regression_df.rename(columns={
#     'index': 'Variable',
#     'Coef.': 'Regression Coefficient',
#     'P>|z|': 'P-value (Regression)'
# }, inplace=True)

# # Remove 'const' from the variables
# regression_df = regression_df[regression_df['Variable'] != 'const']


## Interpret Results

In [10]:
# NOTE(review): this entire cell is disabled (commented out).
# It requires `corr_df` and `regression_df` from earlier disabled cells.
# When enabled, it left-joins the regression columns onto the correlation table
# by 'Variable' (so every row of `corr_df` is kept) and labels each variable
# from its regression p-value: 'N/A' when missing, 'Good Predictor' when below
# 0.05, otherwise 'Not a Good Predictor'.
# # Merge correlation and regression data
# merged_df = pd.merge(
#     corr_df,
#     regression_df[['Variable', 'Regression Coefficient', 'P-value (Regression)']],
#     on='Variable',
#     how='left'
# )

# # Define interpretation based on p-value
# def interpret_p_value(p_value):
#     if pd.isnull(p_value):
#         return 'N/A'
#     elif p_value < 0.05:
#         return 'Good Predictor'
#     else:
#         return 'Not a Good Predictor'

# # Apply the interpretation to the merged DataFrame
# merged_df['Interpretation'] = merged_df['P-value (Regression)'].apply(interpret_p_value)

## Present Results

In [11]:
# NOTE(review): this entire cell is disabled (commented out).
# It requires `merged_df` from the (also disabled) Interpret Results cell.
# `final_df` is a column selection of `merged_df`, and the lines below then
# assign into it; pandas may emit SettingWithCopyWarning here — consider taking
# an explicit `.copy()` of the selection before re-enabling.
# The two p-value columns are replaced by formatted strings, so they are no
# longer numeric after this cell runs.
# # Rearranging columns for presentation
# final_df = merged_df[[
#     'Variable',
#     'Correlation with Sales Outcome',
#     'P-value (Correlation)',
#     'Regression Coefficient',
#     'P-value (Regression)',
#     'Interpretation'
# ]]

# # Format numerical values for better readability
# final_df['Correlation with Sales Outcome'] = final_df['Correlation with Sales Outcome'].round(4)
# final_df['P-value (Correlation)'] = final_df['P-value (Correlation)'].apply(lambda x: f"{x:.3f}")
# final_df['Regression Coefficient'] = final_df['Regression Coefficient'].round(4)
# final_df['P-value (Regression)'] = final_df['P-value (Regression)'].apply(lambda x: f"{x:.3f}" if not pd.isnull(x) else 'N/A')

# # Display the final table
# final_df
