In [None]:
import re
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from wordcloud import WordCloud
# Fetch the NLTK resources used later in this notebook: the VADER lexicon,
# the punkt_tab tokenizer data and the stopword lists.
for nltk_resource in ('vader_lexicon', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

In [None]:
# Load the complaints export; the path is relative to the notebook's working directory.
complaints_csv_path = 'complaints.csv'
complaints_df = pd.read_csv(complaints_csv_path)

In [None]:
# Convert both date columns to datetimes. With errors='coerce', any value that does
# not match the MM/DD/YYYY format becomes NaT instead of raising.
for date_column in ('Date received', 'Date sent to company'):
    complaints_df[date_column] = pd.to_datetime(
        complaints_df[date_column],
        format='%m/%d/%Y',
        errors='coerce',
    )

In [None]:
# Split the comma-separated 'Tags' string into a list and emit one row per tag.
# NOTE(review): explode repeats every other column (and the index label) for each
# tag, so a complaint carrying two tags becomes two rows. Row-based counts computed
# later on complaints_df count such complaints more than once -- confirm intended.
if 'Tags' in complaints_df.columns:
    complaints_df = complaints_df.assign(
        Tags=complaints_df['Tags'].str.split(', ')
    ).explode('Tags')

In [None]:
# Treat empty strings as missing values across the whole frame.
complaints_df = complaints_df.replace({'': np.nan})

In [None]:
# In every object-dtype column, treat the literal text 'N/A' as a missing value.
object_columns = complaints_df.select_dtypes(include='object').columns
for column_name in object_columns:
    complaints_df[column_name] = complaints_df[column_name].replace({'N/A': np.nan})

In [None]:
# Number of distinct non-null values in each column.
unique_counts = complaints_df.nunique()
print("\nUnique value counts per column:", unique_counts, sep="\n")

In [None]:
# value_counts() sorts by frequency (descending), so the first five entries are the
# companies with the most rows in complaints_df.
company_row_counts = complaints_df['Company'].value_counts()
top_5_companies = company_row_counts.iloc[:5]
print("\nTop 5 companies with the most records:", top_5_companies, sep="\n")

In [None]:
# Guard for the sentiment cells that follow, which all read the narrative column.
# Raise instead of calling exit(): inside a notebook exit() asks the kernel to shut
# down, discarding every loaded frame, whereas an exception simply halts execution at
# this cell (including under "Run All") and leaves the session available to inspect.
if 'Consumer complaint narrative' not in complaints_df.columns:
    raise ValueError(
        "'Consumer complaint narrative' column not found. "
        "Cannot proceed with sentiment analysis."
    )

In [None]:
# Restrict the analysis to JPMorgan Chase rows. .copy() gives an independent frame so
# the columns added below are not written into a slice of complaints_df.
jpm_complaints_df = complaints_df[complaints_df['Company'] == "JPMORGAN CHASE & CO."].copy()

# Tokenize each narrative. Missing narratives get an empty token list: casting the
# whole column with astype(str) would turn NaN into the literal text 'nan' and
# tokenize it as the one-word document ['nan'].
jpm_complaints_df['tokenized_narrative'] = jpm_complaints_df['Consumer complaint narrative'].apply(
    lambda narrative: word_tokenize(str(narrative)) if pd.notna(narrative) else []
)

In [None]:
# Create a single VADER sentiment analyzer (NLTK) that the scoring cell below reuses
# for every narrative.
sid = SentimentIntensityAnalyzer()

In [None]:
# Score every narrative with VADER. na_action='ignore' leaves missing narratives as
# NaN instead of scoring the literal text 'nan' (which VADER rates as neutral, 0.0),
# so rows without a narrative no longer pull aggregated sentiment toward zero.
narratives = jpm_complaints_df['Consumer complaint narrative']
jpm_complaints_df['sentiment_scores'] = narratives.map(
    lambda text: sid.polarity_scores(str(text)), na_action='ignore'
)

# Pull the individual VADER components out of the score dicts; NaN rows stay NaN.
jpm_complaints_df['compound_sentiment'] = jpm_complaints_df['sentiment_scores'].map(
    lambda scores: scores['compound'], na_action='ignore'
).astype(float)
jpm_complaints_df['negative_sentiment'] = jpm_complaints_df['sentiment_scores'].map(
    lambda scores: scores['neg'], na_action='ignore'
).astype(float)

stop_words = set(stopwords.words('english'))

# Narratives whose VADER 'neg' score exceeds 0.5 are treated as highly negative.
# NaN scores compare False, so rows without a narrative are excluded.
highly_negative = jpm_complaints_df['negative_sentiment'] > 0.5

# Lower-cased, purely alphabetic, non-stopword tokens from those narratives.
negative_words_list = [
    token.lower()
    for tokens in jpm_complaints_df.loc[highly_negative, 'tokenized_narrative']
    for token in tokens
    if token.isalpha() and token.lower() not in stop_words
]

# Full frequency table (kept complete for the word cloud cell); only the 50 most
# common words are printed to keep the cell output readable.
negative_word_count = Counter(negative_words_list)
print("\nTop negative words for JPMorgan Chase & Co. (based on VADER):")
print(negative_word_count.most_common(50))

In [None]:
# Render the negative-word frequencies as a word cloud, or report that there is
# nothing to draw when the frequency table is empty.
if not negative_word_count:
    print("No sufficiently negative words found to generate a word cloud.")
else:
    word_frequencies = dict(negative_word_count)
    wordcloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
    ).generate_from_frequencies(word_frequencies)

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title('Negative Word Cloud for JPMorgan Chase & Co. Complaints')
    plt.show()

In [None]:
# Mean VADER compound score per product, highest first.
product_sentiment = (
    jpm_complaints_df
    .groupby('Product')['compound_sentiment']
    .mean()
    .reset_index()
    .sort_values(by='compound_sentiment', ascending=False)
)

fig, ax = plt.subplots(figsize=(12, 7))
# Passing `palette` without `hue` is deprecated in seaborn 0.13, so colour the bars by
# assigning hue to the same variable as x. dodge=False keeps full-width bars on older
# seaborn versions, where hue would otherwise split each category into offset bars.
sns.barplot(
    data=product_sentiment,
    x='Product',
    y='compound_sentiment',
    hue='Product',
    palette='viridis',
    dodge=False,
    ax=ax,
)
# The hue legend only repeats the x tick labels; drop it if this seaborn version drew one.
if ax.get_legend() is not None:
    ax.get_legend().remove()

ax.set_title('Average Compound Sentiment by Product for JPMorgan Chase & Co.')
ax.set_xlabel('Product')
ax.set_ylabel('Average Compound Sentiment')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# NOTE(review): emotion_by_dispute is not defined in the visible cells of this
# notebook; this cell reads its 'Consumer disputed', 'sentiment' and 'avg_proportion'
# columns -- confirm the cell that builds it runs before this one.

# Translate the raw dispute flag into legend-friendly labels.
dispute_labels = {'No': 'Not Disputed', 'Yes': 'Disputed'}
emotion_by_dispute['Complaint Status'] = emotion_by_dispute['Consumer disputed'].map(dispute_labels)

# Fixed colour per complaint status.
status_palette = {'Not Disputed': '#E74C3C', 'Disputed': '#3498DB'}

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x='sentiment',
    y='avg_proportion',
    hue='Complaint Status',
    palette=status_palette,
    data=emotion_by_dispute,
    ax=ax,
)

ax.set_title("Emotional Content in Disputed vs. Non-Disputed Complaints\nComparison of normalized emotion scores")
ax.set_xlabel("Emotion")
ax.set_ylabel("Average Proportion of Words")
plt.xticks(rotation=45, ha='right')
ax.legend(title='Complaint Status')
sns.despine()
plt.tight_layout()
plt.show()


In [None]:
# Logistic regression of the dispute outcome on the eight emotion scores.
# `sm` (statsmodels.api) is imported in the notebook's top import cell.
# NOTE(review): emotion_wide is not defined in the visible cells of this notebook. It
# is assumed to be a DataFrame with one numeric column per emotion plus a 0/1
# 'binary_dispute' column -- confirm against the cell that builds it.
emotion_features = ['anger', 'fear', 'joy', 'sadness', 'trust', 'surprise', 'anticipation', 'disgust']

# add_constant prepends the intercept column; sm.Logit does not add one itself.
X = sm.add_constant(emotion_wide[emotion_features])
y = emotion_wide['binary_dispute']

# Fit the model by maximum likelihood.
dispute_model = sm.Logit(y, X).fit()

# Coefficient table and fit statistics.
print(dispute_model.summary())


In [None]:
# Show the fitted model's summary table again for reference.
print(dispute_model.summary())

# Likelihood-ratio chi-square statistic and its p-value, as exposed by the fitted
# statsmodels result object.
lrt_stat = dispute_model.llr
lrt_pvalue = dispute_model.llr_pvalue

for label, value in (
    ("Likelihood Ratio Chi-Square Statistic:", lrt_stat),
    ("p-value:", lrt_pvalue),
):
    print(label, value)
