In [1]:
import pandas as pd
import numpy as np
from keybert import KeyBERT
from keyphrase_vectorizers import KeyphraseCountVectorizer, KeyphraseTfidfVectorizer
import spacy 
from spacy.training.example import Example
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import random
from collections import Counter
import joblib
from bertopic import BERTopic
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px

  from tqdm.autonotebook import tqdm, trange
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


# Review Data
### Methods for analysis
- Sentiment analysis 
- Exporatory data analysis
- Time series
- Keywords

In [2]:
def report_of_model(nlp, data, stage):
    y_true = []
    y_pred = []
    for i, sample in data.iterrows():
        doc = nlp(sample['text'])
        true_cats = sample['cats'] if isinstance(sample['cats'], dict) else eval(sample['cats'])
        pred_cats = doc.cats
        true_label = max(true_cats, key=true_cats.get)
        pred_label = max(pred_cats, key=pred_cats.get)
        y_true.append(true_label)
        y_pred.append(pred_label)

    print(f"\n Classification Report for {stage} Data:")
    print(classification_report(y_true, y_pred))

# Creation and training of the model
# Prepare training data (replace with your labled data)
def train_and_evaluate_model(nlp, train_data, num_epochs=10):
    '''
    Example.from_dict(doc, sample):
    This creates a spaCy Example object, which pairs the processed document (doc) with its correct labels (from sample). This is how spaCy knows what the correct output should be for this example.
    
    nlp.update([gold], drop=0.5): 
    This is where the actual learning happens. It updates the model's parameters based on this example. The drop=0.5 is a dropout rate, which helps prevent overfitting.
    '''
    # needs to be in this format
    # Ensure train_data is a DataFrame
    if not isinstance(train_data, pd.DataFrame):
        raise TypeError("train_data must be a pandas DataFrame")
    
    # Check if required columns exist
    if 'text' not in train_data.columns or 'cats' not in train_data.columns:
        raise ValueError("train_data must have column labels 'text' and 'cats'")

    # split the data
    x_train, x_test = train_test_split(train_data, test_size = 0.3, random_state=42)
    x_hold, x_val = train_test_split(x_test, test_size=0.5) 

    # if there is not a predefined pipeline already added for example nlp.add_pipe("sentencizer") or nlp.add_pipe("parser")
    if 'textcat' not in nlp.pipe_names:
        textcat = nlp.add_pipe("textcat")
        for label in ["positive", "negative", "neutral"]:
            textcat.add_label(label)
    else:
        textcat = nlp.get_pipe("textcat")

    optimizer = nlp.initialize() 

    # train the model 
    print('first round of training')
    for epoch in range(num_epochs):
        shuffled_data = x_train.sample (frac =1, random_state = 42).reset_index(drop=True)
        for i, sample in shuffled_data.iterrows():
            doc = nlp.make_doc(sample['text'])
            cats = sample['cats'] if isinstance(sample['cats'], dict) else eval(sample['cats'])
            gold = Example.from_dict(doc, {"cats": cats})
            nlp.update([gold], sgd=optimizer, drop=0.5)
    print('\n Training Results: ')
    report_of_model(nlp, x_train, "Training Step")

    print('second round of training')
    for epoch in range(num_epochs):
        shuffled_data = x_val.sample (frac =1, random_state = 42).reset_index(drop=True)
        for i, sample in shuffled_data.iterrows():
            doc = nlp.make_doc(sample['text'])
            cats = sample['cats'] if isinstance(sample['cats'], dict) else eval(sample['cats'])
            gold = Example.from_dict(doc, {"cats": cats})
            nlp.update([gold], sgd=optimizer, drop=0.5)
    print('\n Validation Results')
    report_of_model(nlp, x_val, "validation Step")

    print('third round of training')
    for epoch in range(num_epochs):
        shuffled_data = x_hold.sample (frac =1, random_state = 42).reset_index(drop=True)
        for i, sample in shuffled_data.iterrows():
            doc = nlp.make_doc(sample['text'])
            cats = sample['cats'] if isinstance(sample['cats'], dict) else eval(sample['cats'])
            gold = Example.from_dict(doc, {"cats": cats})
            nlp.update([gold], sgd=optimizer, drop=0.5)
    print('\n Test Results')
    report_of_model(nlp,x_hold, "Final Test")

    joblib.dump(nlp, "sentiment_model.joblib")
    return nlp

In [3]:
# load data
raw_training_dataset = pd.read_csv('full_training_data.csv')
training_dataset = raw_training_dataset.drop_duplicates()
training_dataset.head(1)

Unnamed: 0,text,cats
0,"I actually had a good experience overall, so I...","{'positive': 1.0, 'negative': 0.0, 'neutral': ..."


In [4]:
carvana_complaints = pd.read_csv('full_carvana_complaints.csv')
carvana_complaints.head(1)

Unnamed: 0,company,bbb_company_id,phone,complaint_type,status,initial_complaint_date,complaint,Business response_1_date,Business response_1,Customer response_1_date,...,Business response_8_date,Business response_8,Customer response_5_date,Customer response_5,Customer response_6_date,Customer response_6,Customer response_7_date,Customer response_7,Customer response_8_date,Customer response_8
0,Carvana LLC,1126-1000037076,8003334554,Service or Repair Issues,Resolved: The complainant verified the issue w...,7/15/2024,I purchased a 2023 ***** CX-5 online on July 1...,7/16/2024,"Dear BBB,Thank you for bringing this situation...",7/16/2024,...,,,,,,,,,,


In [5]:
carvana_reviews = pd.read_csv("carvana_reviews.csv")
carvana_reviews.head(1)

Unnamed: 0.1,Unnamed: 0,company,bbb_company_id,phone,user_rating_out_of_5,customer_review_1_date,customer_review_1,business_review_resposne_1_date,buisness_review_resposne_1
0,0,Carvana LLC,1126-1000037076,8003334554,1,07/17/2024,I was delivered a car with broken air bags tha...,no response,No response


In [6]:
# train the model
blank_nlp = spacy.blank("en")
model_training = train_and_evaluate_model(blank_nlp, training_dataset)

first round of training

 Training Results: 

 Classification Report for Training Step Data:
              precision    recall  f1-score   support

    negative       0.91      0.76      0.83      5889
     neutral       0.67      0.51      0.58      4965
    positive       0.68      0.95      0.79      5090

    accuracy                           0.75     15944
   macro avg       0.75      0.74      0.73     15944
weighted avg       0.76      0.75      0.74     15944

second round of training

 Validation Results

 Classification Report for validation Step Data:
              precision    recall  f1-score   support

    negative       0.95      0.89      0.92      1328
     neutral       0.82      0.89      0.85      1037
    positive       0.93      0.92      0.93      1052

    accuracy                           0.90      3417
   macro avg       0.90      0.90      0.90      3417
weighted avg       0.91      0.90      0.90      3417

third round of training

 Test Results

 Classifi

### Distrubution of negative and neutral is too low, will need to go back to yelp json file to add more

In [7]:
# dataframe input, dataframe output
def predict_sentiment_df(joblib_file_path , dataframe):
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError("train_data must be a pandas DataFrame")
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError("train_data must be a pandas DataFrame")
    
    try: 
        trained_model = joblib.load(joblib_file_path)
    except Exception as e:
        raise ValueError(f'Error loading joblib file please make sure you have the correct filepath')
    
    # Check if required columns exist
    if 'text' not in dataframe.columns:
        raise ValueError("train_data must have column labels 'text'")
    
    results = []
    texts = dataframe['text']
    docs = list(trained_model.pipe(texts))
    for docs in docs:
        scores = docs.cats
        positive, negative, neutral = scores.values()
        predicted_class = max(scores, key=scores.get)
        results.append({
                'text': docs.text,
                'predicted_class': predicted_class,
                'positive': positive,
                'negative': negative,
                'neutral': neutral
        })
    results_df = pd.DataFrame(results)
    return results_df

In [8]:
trained_model = joblib.load("sentiment_model.joblib")

In [9]:
reviews_for_analysis = pd.DataFrame(carvana_reviews['customer_review_1'])
reviews_for_analysis = reviews_for_analysis.rename(columns={'customer_review_1':'text'})
reviews_for_analysis.head(1)

Unnamed: 0,text
0,I was delivered a car with broken air bags tha...


In [10]:
file_path = "sentiment_model.joblib"
reviews_sentiment_analysis = predict_sentiment_df(file_path, reviews_for_analysis)

In [11]:
reviews_sentiment_analysis

Unnamed: 0,text,predicted_class,positive,negative,neutral
0,I was delivered a car with broken air bags tha...,negative,0.000005,8.617942e-01,0.138201
1,"Easy to buy from, and thats about it.Initial p...",negative,0.004141,6.025425e-01,0.393317
2,Purchased my vehicle with a large down payment...,neutral,0.007864,4.954888e-01,0.496647
3,I give carvana a big fat ZERO they are the wor...,negative,0.000430,9.528344e-01,0.046735
4,Just wanted to share my experience with a rece...,negative,0.000060,9.686783e-01,0.031262
...,...,...,...,...,...
2664,I love the wide vehicle selections on the site...,positive,0.998873,2.020949e-07,0.001127
2665,Service was great staff very friendly process ...,positive,0.999938,3.551396e-14,0.000062
2666,So the acutal purchase experience was much bet...,positive,0.893885,1.063275e-05,0.106104
2667,Saw all the positive reviews on Carvana's webs...,negative,0.000003,9.894114e-01,0.010586


In [12]:
reviews_with_date = carvana_reviews[['customer_review_1_date', 'customer_review_1']]
reviews_with_date = reviews_with_date.rename(columns={'customer_review_1':'text'})
# Include data
reviews_sentiment = pd.merge(reviews_sentiment_analysis, reviews_with_date, how='left', on='text')
reviews_sentiment.head(1)

Unnamed: 0,text,predicted_class,positive,negative,neutral,customer_review_1_date
0,I was delivered a car with broken air bags tha...,negative,5e-06,0.861794,0.138201,07/17/2024


In [14]:
prediction_classes_date_count = reviews_sentiment.copy()
prediction_classes_date_count = prediction_classes_date_count.groupby(['customer_review_1_date','predicted_class']).size().reset_index(name='count')
prediction_classes_date_count = prediction_classes_date_count.groupby(['customer_review_1_date','predicted_class'])['count'].sum().reset_index()
prediction_classes_date_count = prediction_classes_date_count.rename(columns={'customer_review_1_date':'date'})
prediction_classes_date_count


Unnamed: 0,date,predicted_class,count
0,01/01/2022,negative,1
1,01/01/2024,negative,1
2,01/02/2024,negative,2
3,01/03/2022,negative,1
4,01/03/2024,negative,5
...,...,...,...
1270,12/29/2023,neutral,1
1271,12/31/2021,negative,4
1272,12/31/2021,neutral,1
1273,12/31/2022,positive,1


In [16]:

prediction_classes_date_count['date'] = pd.to_datetime(prediction_classes_date_count['date'])
all_date_data = prediction_classes_date_count.copy()

all_date_data['year_month'] = all_date_data['date'].dt.strftime('%m/%Y')
all_date_data = all_date_data.groupby(['year_month', 'predicted_class','date']).sum().reset_index()

all_date_data['sort_date'] = pd.to_datetime(all_date_data['year_month'], format='%m/%Y')
all_date_data = all_date_data.sort_values('sort_date')

all_date_data['year'] = pd.to_datetime(all_date_data['date'])
all_date_data = all_date_data.groupby([all_date_data['year'].dt.year, 'predicted_class','date','year_month'])['count'].sum().reset_index()
all_date_data


Unnamed: 0,year,predicted_class,date,year_month,count
0,2021,negative,2021-07-22,07/2021,4
1,2021,negative,2021-07-23,07/2021,2
2,2021,negative,2021-07-24,07/2021,1
3,2021,negative,2021-07-25,07/2021,2
4,2021,negative,2021-07-26,07/2021,8
...,...,...,...,...,...
1270,2024,positive,2024-05-30,05/2024,1
1271,2024,positive,2024-06-04,06/2024,1
1272,2024,positive,2024-06-06,06/2024,1
1273,2024,positive,2024-06-09,06/2024,1


In [17]:
yearly_data = all_date_data[['year','predicted_class','count','date']]

yearly_data

Unnamed: 0,year,predicted_class,count,date
0,2021,negative,4,2021-07-22
1,2021,negative,2,2021-07-23
2,2021,negative,1,2021-07-24
3,2021,negative,2,2021-07-25
4,2021,negative,8,2021-07-26
...,...,...,...,...
1270,2024,positive,1,2024-05-30
1271,2024,positive,1,2024-06-04
1272,2024,positive,1,2024-06-06
1273,2024,positive,1,2024-06-09


In [19]:
year_2022 = all_date_data[all_date_data['date'].dt.year == 2022]

year_2022 

Unnamed: 0,year,predicted_class,date,year_month,count
322,2022,negative,2022-01-01,01/2022,1
323,2022,negative,2022-01-03,01/2022,1
324,2022,negative,2022-01-04,01/2022,2
325,2022,negative,2022-01-05,01/2022,2
326,2022,negative,2022-01-06,01/2022,4
...,...,...,...,...,...
675,2022,positive,2022-10-24,10/2022,1
676,2022,positive,2022-10-26,10/2022,1
677,2022,positive,2022-11-28,11/2022,1
678,2022,positive,2022-12-12,12/2022,1


In [None]:
fig = px.line(year_2022, x='date', y='count', color='predicted_class',
                        title='Yearly Time Series of Predicted Classes',
                        labels={'date':'Date', 'count':'Count', 'predicted_class':'Predicted Class'},
                        line_shape='linear',
                        render_mode='svg'
                        )
fig.update_layout(
    plot_bgcolor='rgba(240,240,240,1)',
    paper_bgcolor='rgba(0,0,0,0)',
    xaxis_title ='Date',
    yaxis_title ='Count',
    legend_title='Predicted Class',
    font=dict(size=12),
    xaxis=dict(showgrid=False),
    yaxis=dict(showgrid=False),
    hovermode ='x unified'
)

fig.show()  

In [None]:
fig = px.line(monthly_data, x='date', y='count', color='predicted_class',
                title='Time series of Predicted Classes',
                labels={'date':'Date', 'count':'Count', 'predicted_class':'Predicted Class'},
                line_shape='linear',
                render_mode='svg'
                )

fig.update_layout(
    plot_bgcolor='rgba(0,0,0,0)',
    xaxis_title ='Date',
    yaxis_title ='Count',
    legend_title='Predicted Class',
    font=dict(size=12),
    xaxis=dict(showgrid=False),
    yaxis=dict(showgrid=False),
    hovermode ='x unified'
)

fig.show()


In [20]:
predict_totals = prediction_classes_date_count.groupby(['predicted_class']).size().reset_index(name='total').reset_index(drop=True)
predict_totals

Unnamed: 0,predicted_class,total
0,negative,846
1,neutral,284
2,positive,145


In [None]:
category = predict_totals['predicted_class'][0]
total = predict_totals['total'][0]

print(f"predicted Class: {category}")
print(f'total: {total}')

predicted Class: negative
total: 863


In [None]:
# multi word
def keyphrase(dataframe: pd.Series) -> pd.DataFrame:
    '''
    Generates keywords of multiple lengths from given text input
        Args:
            column (pd.Series): DataFrame column containing text
        Returns:
            pd.DataFrame: DataFrame of keyword phrases and their importance

        print("Keywords, Phrases")
    '''
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError("train_data must be a pandas DataFrame")
    
    # Check if required columns exist
    if 'text' not in dataframe.columns:
        raise ValueError("train_data must have column labels 'text'")

    kw_model = KeyBERT()
    # Convert all values to strings and replace NaN with empty string
    dataframe['text'] = dataframe['text'].astype(str).replace('nan', '')

    vectorizer = KeyphraseCountVectorizer(
        pos_pattern='<N.*>+<N.*>|<N.*>|<J.*>+<N.*>+',
        stop_words='english'
    )


    try:
        kw = kw_model.extract_keywords(
                                    docs= dataframe['text'].to_list(), 
                                    vectorizer= vectorizer)
    except ValueError as e:
        print(f"Error occurred: {e}")
        return pd.DataFrame()
    
    

    keyword_phrases_list = []
    for kw_tuple in kw:
        try:
            if kw_tuple:  # Check if kw_tuple is not empty
                keyword_list, keyword_importance = map(list, zip(*kw_tuple))
                keyword_phrases_list.append({'keyword': keyword_list,
                                            "importance": keyword_importance})
            else:
                keyword_phrases_list.append({'keyword': [], "importance": []})
        except Exception as e:
            print(f"Error processing keyword tuple: {e}")
            keyword_phrases_list.append({'keyword': [], "importance": []})

    keyword_phrases = pd.DataFrame(keyword_phrases_list)

    if not keyword_phrases.empty:
        keyword_phrases['keyword'] = keyword_phrases['keyword'].apply(lambda x: ', '.join(x) if x else '')
        keyword_phrases['total_importance'] = keyword_phrases['importance'].apply(lambda x: sum(x) if x else 0).round(2)
        keyword_phrases['importance'] = keyword_phrases['importance'].apply(lambda x: '-'.join(map(str, x)) if x else '')

    return keyword_phrases

In [None]:
# keyphrases
carvana_review_text = carvana_reviews['customer_review_1']
carvana_review_text = pd.DataFrame(carvana_review_text)
carvana_review_text = carvana_review_text.rename(columns={'customer_review_1':'text'})
carvana_review_text

Unnamed: 0,text
0,I was delivered a car with broken air bags tha...
1,"Easy to buy from, and thats about it.Initial p..."
2,Purchased my vehicle with a large down payment...
3,I give carvana a big fat ZERO they are the wor...
4,Just wanted to share my experience with a rece...
...,...
2664,I love the wide vehicle selections on the site...
2665,Service was great staff very friendly process ...
2666,So the acutal purchase experience was much bet...
2667,Saw all the positive reviews on Carvana's webs...


In [None]:
review_keyphrases = keyphrase(carvana_review_text)
review_keyphrases

Unnamed: 0,keyword,importance,total_importance
0,"broken air bags, car seller, lawyer, corrupt b...",0.4594-0.4097-0.4001-0.3091-0.2842,1.86
1,"warranty mileage, warranty provider, mileage d...",0.6755-0.5409-0.506-0.4987-0.4912,2.71
2,"appointment, call, different car, phone call, ...",0.3416-0.281-0.2753-0.2713-0.2439,1.41
3,"business carvana, real customer reviews, carva...",0.4572-0.4091-0.4066-0.4032-0.3952,2.07
4,"transaction, financing, car purchase, purchase...",0.4496-0.4427-0.4359-0.3848-0.3705,2.08
...,...,...,...
2664,"wide vehicle selections, carvana, vehicle, car...",0.6643-0.3804-0.3211-0.3178-0.286,1.97
2665,"friendly process, great staff, service, staff,...",0.5852-0.5638-0.508-0.4715-0.3386,2.47
2666,"acutal purchase experience, purchase carvana, ...",0.6223-0.5405-0.4688-0.4678-0.3552,2.45
2667,"vehicle, carvana, transportation issue, delive...",0.4186-0.4059-0.3948-0.3602-0.3589,1.94


In [None]:
review_keyphrases_top_50 = review_keyphrases.head(50)
review_keyphrases_top_50 = review_keyphrases_top_50.drop(columns=['importance', 'total_importance'])
review_keyphrases_top_50

Unnamed: 0,keyword
0,"broken air bags, car seller, lawyer, corrupt b..."
1,"warranty mileage, warranty provider, mileage d..."
2,"appointment, call, different car, phone call, ..."
3,"business carvana, real customer reviews, carva..."
4,"transaction, financing, car purchase, purchase..."
5,"predatory loans, refinance, refinance process,..."
6,"carvana car, insurance, carvana, subprime loan..."
7,"car carvana, previous car, car, much negative ..."
8,"name change, company, deposit, marriage, insane"
9,"arbitration, carvana, exchange vehicle, carfax..."


In [None]:
def overall_keywords_and_importance(dataframe):
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError("train_data must be a pandas DataFrame")
    
    # Check if required columns exist
    if 'text' not in dataframe.columns:
        raise ValueError("train_data must have column labels 'text'")

    try:
        # In case some items are not strings 
        dataframe['text'] = dataframe['text'].astype(str)
        vectorizer = KeyphraseCountVectorizer()
        list_matrix = vectorizer.fit_transform(dataframe['text'])
        feature_names = vectorizer.get_feature_names_out()
        importance = list_matrix.sum(axis=0).A1
        
        keyword_list = []
        for keyword, importance  in zip(feature_names, importance):
            keyword_list.append({
                'keyword': keyword,
                'importance': importance})
        
        unsorted_keywords = pd.DataFrame(keyword_list)
        keywords = unsorted_keywords.sort_values('importance', ascending=False)
    
    except Exception as e:
            keyword_list.append([])
            keywords = pd.DataFrame(keyword_list)
    return keywords 

In [None]:
import re

def remove_all_non_letters(dataframe):
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError("train_data must be a pandas DataFrame")
    
    # Check if required columns exist
    if 'keyword' not in dataframe.columns:
        raise ValueError("train_data must have column labels keyword")
    

    common_stop_words = [
    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at",
    "be", "because", "been", "before", "being", "below", "between", "both", "but", "by",
    "could",
    "did", "do", "does", "doing", "down", "during",
    "each",
    "few", "for", "from", "further",
    "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", 
    "him", "himself", "his", "how", "how's",
    "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself",
    "let's",
    "me", "more", "most", "my", "myself",
    "no", "nor", "not",
    "of", "off", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own",
    "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such",
    "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", 
    "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too",
    "under", "until", "up",
    "very",
    "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", 
    "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would",
    "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves", 'get', 'back','get'
    ]
    
    # Identify the columns with the non letters
    cleaned_df = dataframe.copy()

    # cleaning of unnecessary string data 
    cleaned_df['keyword'] = cleaned_df['keyword'].astype(str)
    cleaned_df['keyword']  = cleaned_df['keyword'].str.replace(r'[^a-zA-Z\s]', '', regex=True)
    cleaned_df['keyword'] = cleaned_df['keyword'].str.replace(r'\b[a-zA-Z]\b','',regex=True)
    cleaned_df['keyword']  = cleaned_df['keyword'].str.replace(r'\s+', '', regex=True).str.strip()

    for i, row in cleaned_df.iterrows():
        if row['keyword'] == common_stop_words:
            cleaned_df.at[i, 'keyword'] = ' ' 
    
    # drop empty rows
    cleaned_df = cleaned_df[cleaned_df['keyword'] != '']

    #print results 
    original_columns = len(dataframe)
    cleaned_columns = len(cleaned_df)

    print(f'Original columns: {original_columns}')
    print(f'Columns after cleaning: {cleaned_columns}')
    return cleaned_df



In [None]:
# keywords
keywords_review = overall_keywords_and_importance(carvana_review_text)

In [None]:
keywords_review = remove_all_non_letters(keywords_review)
keywords_review.head(5)

Original columns: 14925
Columns after cleaning: 14751


Unnamed: 0,keyword,importance
8774,car,6259
12571,carvana,3716
5728,vehicle,2381
235,would,2017
400,get,1722


In [None]:
from wordcloud import WordCloud

def wordcloud(dataframe, width=2000, height=1200):
    def red_color_func(word, font_size, position, orientation, frequency, max_frequency, **kwargs):
        # Normalize the frequency
        normalized_frequency = frequency / max_frequency
        
        # Calculate the red intensity based on frequency
        red_intensity = int(255 * normalized_frequency)
        
        return f"rgb({red_intensity}, 0, 0)"

    # Assuming you have a dictionary of word frequencies
    word_freq = dict(zip(dataframe['keyword'], dataframe['importance']))
    # Find the maximum frequency
    max_freq = max(word_freq.values())

    # Instantiate Word Cloud
    wc = WordCloud(width=width, 
                    height=height,
                    min_font_size=10,
                    background_color='white',
                    color_func=lambda *args, **kwargs: red_color_func(*args, **kwargs, 
                                                                frequency=word_freq[args[0]], 
                                                                max_frequency=max_freq))
    # Generate a word cloud
    keyword_wc = wc.generate(dataframe['keyword'].str.cat(sep=" "))
    fig = px.imshow(
        keyword_wc,
        binary_string=True,
        aspect='auto'
    )

    # Update the layout to remove axes and gridlines
    fig.update_layout(
        xaxis=dict(visible=False, showticklabels=False),
        yaxis=dict(visible=False, showticklabels=False),
        margin=dict(l=0, r=0, t=0, b=0),
        width=width, 
        height=height,
    )

    # Remove hover effects
    fig.update_traces(hoverinfo='none', hovertemplate=None)
    return fig

In [None]:
keyphrases_review = pd.read_csv('reviews_keyphrases.csv')
keywords_review = pd.read_csv('keywords_reviews.csv')

In [None]:
top_500_review_words = keywords_review.head(500)
fig = wordcloud(top_500_review_words)
fig.show()

In [None]:
carvana_reviews.head(1)

Unnamed: 0.1,Unnamed: 0,company,bbb_company_id,phone,user_rating_out_of_5,customer_review_1_date,customer_review_1,business_review_resposne_1_date,buisness_review_resposne_1
0,0,Carvana LLC,1126-1000037076,8003334554,1,07/17/2024,I was delivered a car with broken air bags tha...,no response,No response


In [None]:
carvana_reviews['customer_review_1_date'] = pd.to_datetime(carvana_reviews['customer_review_1_date'])

score_date_count = carvana_reviews.groupby(['customer_review_1_date', 'user_rating_out_of_5']).size().reset_index(name='count')
score_date_count.head(20)

In [None]:
#score_date_count.to_feather('review_score_out_of_5.feather')
#score_date_count.to_feather('2nd_review_score_out_of_5.feather')

In [None]:

reviews_count = carvana_reviews.groupby(['customer_review_1_date','customer_review_1']).size().reset_index(name='count')
reviews_count = reviews_count.drop(columns=['customer_review_1'])
reviews_count = reviews_count.groupby('customer_review_1_date')['count'].sum().reset_index()
reviews_count = reviews_count.rename(columns={'customer_review_1_date':'date'})
reviews_count

Unnamed: 0,date,count
0,01/01/2022,1
1,01/01/2024,1
2,01/02/2024,2
3,01/03/2022,1
4,01/03/2024,6
...,...,...
889,12/29/2022,2
890,12/29/2023,2
891,12/31/2021,5
892,12/31/2022,1


In [None]:
reviews_count['date'] = pd.to_datetime(reviews_count['date'])
all_date_data_review = reviews_count.copy()

all_date_data_review['year_month'] = all_date_data_review['date'].dt.strftime('%m/%Y')
all_date_data_review = all_date_data_review.groupby(['year_month','date']).sum().reset_index()

all_date_data_review['sort_date'] = pd.to_datetime(all_date_data_review['year_month'], format='%m/%Y')
all_date_data_review = all_date_data_review.sort_values('sort_date')

all_date_data_review['year'] = pd.to_datetime(all_date_data_review['date'])
all_date_data_review = all_date_data_review.groupby([all_date_data_review['year'].dt.year,'date','year_month'])['count'].sum().reset_index()
all_date_data_review

Unnamed: 0,year,date,year_month,count
0,2021,2021-07-22,07/2021,8
1,2021,2021-07-23,07/2021,6
2,2021,2021-07-24,07/2021,6
3,2021,2021-07-25,07/2021,4
4,2021,2021-07-26,07/2021,11
...,...,...,...,...
889,2024,2024-07-11,07/2024,3
890,2024,2024-07-12,07/2024,2
891,2024,2024-07-15,07/2024,3
892,2024,2024-07-16,07/2024,1


In [None]:
all_date_data_review.to_feather('reviews_count.feather')

In [None]:
overall_rev = all_date_data_review[['year','count']]
overall_rev = overall_rev.groupby(['year']).sum().reset_index()
overall_rev

Unnamed: 0,year,count
0,2021,933
1,2022,611
2,2023,673
3,2024,452


In [None]:
fig = px.line(overall_rev, x='year', y='count',
                title=f'Time Series for the year',
                labels={'date':'Date', 'count':'Count'},
                line_shape='linear',
                render_mode='svg'
                )
fig.update_layout(
    plot_bgcolor='rgba(240,240,240,1)',
    paper_bgcolor='rgba(0,0,0,0)',
    xaxis_title ='Date',
    yaxis_title ='Count',
    font=dict(size=12),
    xaxis=dict(showgrid=False,dtick="Y1",tickformat="%Y"),
        yaxis=dict(showgrid=False),
    hovermode ='x unified'
)

In [None]:
all_date_data_review['year'] = pd.to_datetime(all_date_data_review['year'].astype(str))
monthly_rev = all_date_data_review[all_date_data_review['year'].dt.year == 2024]

In [None]:
fig = px.line(monthly_rev, x='date', y='count',
                        title='Yearly Time Series of Predicted Classes',
                        labels={'date':'Date', 'count':'Count', 'predicted_class':'Predicted Class'},
                        line_shape='linear',
                        render_mode='svg'
                        )
fig.update_layout(
    plot_bgcolor='rgba(240,240,240,1)',
    paper_bgcolor='rgba(0,0,0,0)',
    xaxis_title ='Date',
    yaxis_title ='Count',
    legend_title='Predicted Class',
    font=dict(size=12),
    xaxis=dict(showgrid=False),
    yaxis=dict(showgrid=False),
    hovermode ='x unified'
)


https://medium.com/@yennhi95zz/a-guide-to-time-series-models-in-machine-learning-usage-pros-and-cons-ac590a75e8b3

We will be making predictions with an Autoregressive Integrated Moving Average (ARIMA)

# Complaints Analysis
### Methods for analysis
- Keywords 
- Time series

In [None]:
carvana_complaints_text = carvana_complaints['complaint']
carvana_complaints_text = pd.DataFrame(carvana_complaints_text)
carvana_complaints_text = carvana_complaints_text.rename(columns={'complaint':'text'})
carvana_complaints_text

Unnamed: 0,text
0,I purchased a 2023 ***** CX-5 online on July 1...
1,Carvana brought me the 2017 **** EXPLORER on J...
2,Im writing to share an experience with Carvana...
3,On July 2nd I attempted to purchase a vehicle ...
4,I bought a car from Carvana 89 days ago. I st...
...,...
4314,I purchased a vehicle from Carvana and had it ...
4315,I attempted to purchase a car from Carvana two...
4316,Bought a car from Carvana 3/8/2021. I paid for...
4317,Carvana sold me a car that was already sold. I...


In [None]:
# keyphrases
keyphrases_complaint = keyphrase(carvana_complaints_text)

2024-07-22 20:56:22,169 - KeyphraseVectorizer - INFO - It looks like you do not have downloaded a list of stopwords yet. It is attempted to download the stopwords now.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kahsw\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
2024-07-22 20:56:22,687 - KeyphraseVectorizer - INFO - It looks like the selected spaCy pipeline is not downloaded yet. It is attempted to download the spaCy pipeline now.
2024-07-22 20:56:22,687 - KeyphraseVectorizer - INFO - It looks like the selected spaCy pipeline is not downloaded yet. It is attempted to download the spaCy pipeline now.


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
keyphrases_complaint_df = keyphrases_complaint.copy()
keyphrases_complaint_df  = keyphrases_complaint_df.sort_values('total_importance', ascending=False)
keyphrases_complaint_df  = keyphrases_complaint_df .head(50)
keyphrases_complaint_df  = keyphrases_complaint_df.drop(columns=['importance','total_importance'])
keyphrases_complaint_df  = keyphrases_complaint_df .rename(columns={'keyword':'keyphrase'})
keyphrases_complaint_df 

Unnamed: 0,keyphrase
2230,"different temporary license plates, temporary ..."
1156,"5)deceptive business practice, 3)deceptive bus..."
1113,"consumer credit transaction, consumer credit, ..."
1867,"title lien, lien title, lien date, incorrect l..."
3842,"purchase carvana, car carvana, carvana vehicle..."
856,"title lien, lien title, carvana lien, lien rel..."
1916,"financed car december,02,2022, financed car, f..."
3838,"carvana plates, contact carvana, car carvana, ..."
1417,"carvanas warranty, carvanas warranty office, c..."
2055,"4wd activation, 4wd, toyota 4runner, 4runner, 2wd"


In [None]:
# keywords
keywords_complaint = overall_keywords_and_importance(carvana_complaints_text)

In [None]:
cleaned_keywords_complaints = remove_all_non_letters(keywords_complaint)
cleaned_keywords_complaints.head(10)

Original columns: 24417
Columns after cleaning: 23945


Unnamed: 0,keyword,importance
5664,car,11462
12504,carvana,10185
15032,vehicle,7490
1873,get,2975
10233,back,2618
6447,purchased,2430
1233,time,2147
12512,registration,2111
5284,delivery,2077
14988,days,2055


In [None]:
# top 100 words 
top_500_keywords_complaints = cleaned_keywords_complaints.head(500)
top_500_keywords_complaints.head(1)

Unnamed: 0,keyword,importance
5664,car,11462


In [None]:
# Word map
fig = wordcloud(top_500_keywords_complaints)
fig.show()

In [None]:
# complaint counts by date 
carvana_complaints['initial_complaint_date'] = pd.to_datetime(carvana_complaints['initial_complaint_date'], format='%m/%d/%Y', errors='coerce')
# clean the date for this format 
num_complaints = carvana_complaints.groupby(['initial_complaint_date','complaint','complaint_type']).size().reset_index(name='count')
num_complaints = num_complaints.drop(columns='complaint')
num_complaints = num_complaints.groupby(['initial_complaint_date','complaint_type'])['count'].sum().reset_index()
num_complaints = num_complaints.rename(columns={'initial_complaint_date':'date'})
num_complaints

Unnamed: 0,date,complaint_type,count
0,2021-07-19,Product Issues,2
1,2021-07-19,Sales and Advertising Issues,1
2,2021-07-19,Service or Repair Issues,6
3,2021-07-20,Order Issues,1
4,2021-07-20,Product Issues,1
...,...,...,...
2086,2024-07-06,Sales and Advertising Issues,1
2087,2024-07-06,Service or Repair Issues,1
2088,2024-07-08,Service or Repair Issues,1
2089,2024-07-09,Service or Repair Issues,3


In [None]:
num_complaints['date'] = pd.to_datetime(num_complaints['date'])
all_date_data_complaints = num_complaints.copy()

all_date_data_complaints['year_month'] = all_date_data_complaints ['date'].dt.strftime('%m/%Y')
all_date_data_complaints  = all_date_data_complaints .groupby(['year_month','date','complaint_type']).sum().reset_index()

all_date_data_complaints ['sort_date'] = pd.to_datetime(all_date_data_complaints ['year_month'], format='%m/%Y')
all_date_data_complaints  = all_date_data_complaints .sort_values('sort_date')

all_date_data_complaints['year'] = pd.to_datetime(all_date_data_complaints['date'])
all_date_data_complaints = all_date_data_complaints .groupby([all_date_data_complaints ['year'].dt.year,'date','year_month','complaint_type'])['count'].sum().reset_index()
all_date_data_complaints 

Unnamed: 0,year,date,year_month,complaint_type,count
0,2021,2021-07-19,07/2021,Product Issues,2
1,2021,2021-07-19,07/2021,Sales and Advertising Issues,1
2,2021,2021-07-19,07/2021,Service or Repair Issues,6
3,2021,2021-07-20,07/2021,Order Issues,1
4,2021,2021-07-20,07/2021,Product Issues,1
...,...,...,...,...,...
2086,2024,2024-07-06,07/2024,Sales and Advertising Issues,1
2087,2024,2024-07-06,07/2024,Service or Repair Issues,1
2088,2024,2024-07-08,07/2024,Service or Repair Issues,1
2089,2024,2024-07-09,07/2024,Service or Repair Issues,3


In [None]:
only_yr = all_date_data_complaints[['year','count']]
only_yr = only_yr.groupby(['year']).sum().reset_index()
only_yr

Unnamed: 0,year,count
0,2021,835
1,2022,1506
2,2023,1226
3,2024,749


In [None]:
only_serv = all_date_data_complaints[['year','count','complaint_type']]
only_serv = only_serv.groupby(['year','count','complaint_type']).sum().reset_index()
only_serv

Unnamed: 0,year,count,complaint_type
0,2021,1,Billing Issues
1,2021,1,Customer Service Issues
2,2021,1,Delivery Issues
3,2021,1,Order Issues
4,2021,1,Product Issues
...,...,...,...
85,2024,4,Service or Repair Issues
86,2024,5,Service or Repair Issues
87,2024,6,Service or Repair Issues
88,2024,7,Service or Repair Issues


In [None]:
selected_year = 2021
year_data = only_serv[only_serv['year'] == selected_year]
year_data = year_data.groupby('complaint_type')['count'].sum().reset_index()
fig = px.bar(year_data, x='complaint_type', y='count', color='complaint_type',
    title='Complaints by Service Type Over Time ',
    labels={'date':'Date', 'count':'Count', 'complaint_type':'Complaint Type'},
    )
fig.update_layout(
    plot_bgcolor='rgba(240,240,240,1)',
    paper_bgcolor='rgba(0,0,0,0)',
    xaxis_title ='Complaint Type',
    yaxis_title ='Count',
    font=dict(size=12),
    xaxis=dict(showgrid=False,dtick="Y1",tickformat="%Y"),
    yaxis=dict(showgrid=False),
    hovermode ='x unified'
)

In [None]:
#only_serv.to_feather('complaints_count.feather')

In [None]:
only_serv_feather = pd.read_feather('complaints_count.feather')
only_serv_feather 

Unnamed: 0,year,count,complaint_type
0,2021,1,Billing Issues
1,2021,1,Customer Service Issues
2,2021,1,Delivery Issues
3,2021,1,Order Issues
4,2021,1,Product Issues
...,...,...,...
85,2024,4,Service or Repair Issues
86,2024,5,Service or Repair Issues
87,2024,6,Service or Repair Issues
88,2024,7,Service or Repair Issues
