In [2]:
import pandas as pd
from io import StringIO
import os
from tqdm import tqdm

In [69]:
import pandas as pd

def fix_and_read_csv(file_path):
    # Read the file content
    with open(file_path, 'r', encoding='utf-8') as file:
        # Take the first line as header
        header = file.readline().strip()
        content = file.read()
    
    # Fix the line breaks within fields
    fixed_content = header + '\n'
    in_review = False
    for line in content.split('\n'):
        if line.startswith(tuple('0123456789')) and ',' in line:
            # This is a new entry
            if len(fixed_content) > len(header) + 1:  # Exclude the first line
                fixed_content += '\n'
            in_review = True
            fixed_content += line
        elif in_review:
            # This is a continuation of the previous line
            fixed_content += ' ' + line.strip()
    
    # Create DataFrame from fixed content
    df = pd.read_csv(StringIO(fixed_content))
    return df

In [70]:
def merge_reviews(file_list):
    all_reviews = []
    for file_path in tqdm(file_list):
        # Extract car make from the file name
        base_name = os.path.basename(file_path)
        parts = base_name.split('_')
        car_make = parts[-1].replace('.csv', '')
        # Read and fix CSV
        df = fix_and_read_csv(file_path)
        
        # Add the 'Car_Make' column
        df['Car_Make'] = car_make
        
        # Append to list
        all_reviews.append(df)
    
    # Concatenate all DataFrames into a single one
    merged_df = pd.concat(all_reviews, ignore_index=True)
    return merged_df

In [71]:
# Specify the directory
directory = '/Users/kai/Documents/Study/S&DS625 Case/Carismatic/data/car_reviews'
file_list = [os.path.join(directory, f) for f in os.listdir(directory) if f.endswith('.csv')]
print(file_list)

['/Users/kai/Documents/Study/S&DS625 Case/Carismatic/data/car_reviews/Scrapped_Car_Reviews_Volkswagen.csv', '/Users/kai/Documents/Study/S&DS625 Case/Carismatic/data/car_reviews/Scraped_Car_Review_lamborghini.csv', '/Users/kai/Documents/Study/S&DS625 Case/Carismatic/data/car_reviews/Scraped_Car_Review_lotus.csv', '/Users/kai/Documents/Study/S&DS625 Case/Carismatic/data/car_reviews/Scraped_Car_Review_isuzu.csv', '/Users/kai/Documents/Study/S&DS625 Case/Carismatic/data/car_reviews/Scraped_Car_Review_ferrari.csv', '/Users/kai/Documents/Study/S&DS625 Case/Carismatic/data/car_reviews/Scrapped_Car_Reviews_GMC.csv', '/Users/kai/Documents/Study/S&DS625 Case/Carismatic/data/car_reviews/Scraped_Car_Review_land-rover.csv', '/Users/kai/Documents/Study/S&DS625 Case/Carismatic/data/car_reviews/Scraped_Car_Review_lincoln.csv', '/Users/kai/Documents/Study/S&DS625 Case/Carismatic/data/car_reviews/Scrapped_Car_Reviews_BMW.csv', '/Users/kai/Documents/Study/S&DS625 Case/Carismatic/data/car_reviews/Scraped_

In [72]:
# Process the files
merged_df = merge_reviews(file_list)

100%|██████████| 50/50 [00:02<00:00, 21.48it/s]


In [154]:
# drop first column
merged_df = merged_df.drop(columns=['Unnamed: 0'])

In [155]:
merged_df.to_csv('review_data.csv')

In [171]:
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

- `df_a` is car dataset 
- `df_b` is review dataset

In [172]:
df_a = pd.read_csv('data.csv')
df_b = merged_df

Extract Maker, Model and Year from review dataset and fuzzy match with car dataset

In [173]:
from fuzzywuzzy import fuzz
import pandas as pd
from tqdm import tqdm

tqdm.pandas()

def find_match(row, df_a):
    try:
        # Create a string to match from df_a columns
        df_a['full_name'] = df_a['Genmodel'] + ' ' + df_a['Year'].astype(str)
        
        # Extract components from Vehicle_Title
        title_parts = row['Vehicle_Title'].split()
        year = title_parts[0]
        maker = title_parts[1]
        model = ' '.join(title_parts[2:4])  # Usually brand and model
    
        # Calculate similarity scores
        maker_scores = df_a['Maker'].apply(lambda x: fuzz.ratio(x.lower(), maker.lower()))
        model_scores = df_a['Genmodel'].apply(lambda x: fuzz.ratio(x.lower(), model.lower()))
        year_scores = df_a['Year'].apply(lambda x: fuzz.ratio(str(x).lower(), year.lower()))
    
        # Combine scores with different weights
        total_scores = maker_scores * 0.45 + model_scores * 0.45 + year_scores * 0.1
    
        # Find the best match
        best_score = max(total_scores)
        best_match_idx = total_scores.idxmax()
    
        # Logging for evaluation
        # print(best_score, maker, model, year, df_a.loc[best_match_idx, 'full_name'])
    
        # Return the index of the best match if it's a good match
        return best_match_idx if best_score > 70 else None
    except Exception as e:
        # Log the error if needed and return None to signify a non-match
        # print(f"Skipping due to error: {e} for vehicle title: {row['Vehicle_Title']}")
        return None

# Apply the modified matching function
df_b['match_idx'] = df_b.progress_apply(lambda row: find_match(row, df_a), axis=1)

# Filter out non-matches
filtered_df_b = df_b[df_b['match_idx'].notna()]

# Merge the dataframes
result = filtered_df_b.merge(df_a, left_on='match_idx', right_index=True)

100%|██████████| 227080/227080 [7:15:11<00:00,  8.70it/s]   


In [174]:
# Filter Year, Genmodel_ID and Review and Rating
result = result[['Year', 'Genmodel_ID', 'Review', 'Rating']]
result.head()

Unnamed: 0,Year,Genmodel_ID,Review,Rating
0,2007,95_3,I've had my Beetle Convertible for over 4.5 y...,4.5
1,2007,95_3,We bought the car new in 2007 and are general...,4.375
2,2007,95_3,I adore my New Beetle. Even though I'm a male...,4.375
3,2007,95_3,My wife chose this car to replace a Sebring c...,4.375
4,2007,95_3,4 of us carpool 1 way 30 min. Backseat ok fo...,4.75


In [175]:
# result.to_csv("final_data_raw.csv")

In [3]:
result = pd.read_csv("final_data_raw.csv", index = False)

In [4]:
result = result[['Year', 'Genmodel_ID', 'Review', 'Rating']]

In [5]:
# Use a pipeline as a high-level helper
from transformers import pipeline, AutoTokenizer
pipe = pipeline("text-classification", model="nlptown/bert-base-multilingual-uncased-sentiment")
# Initialize the tokenizer for your model to tokenize text
tokenizer = AutoTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")  # Replace with the actual model name
max_length = tokenizer.model_max_length  # Typically 512 for BERT models

print(max_length)

  Referenced from: <253997FD-685F-34A9-B3D7-4AF6DAE96CDF> /Users/kai/miniconda3/envs/car/lib/python3.11/site-packages/torchvision/image.so
  warn(
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


512


In [6]:
from tqdm import tqdm
from transformers import AutoTokenizer

tqdm.pandas()

# Map from star labels to numerical values
star_to_numeric = {
    '1 star': 1,
    '2 stars': 2,
    '3 stars': 3,
    '4 stars': 4,
    '5 stars': 5
}

# Function to safely convert sentiment label to numeric
def safe_convert(review):
    # Check if the review is a string
    if not isinstance(review, str):
        return None  # Or some default/special value

    # Tokenize and truncate the review if necessary
    tokens = tokenizer.tokenize(review)
    if len(tokens) > max_length:
        review = tokenizer.convert_tokens_to_string(tokens[:max_length - 2])

    try:
        # Process the review and convert the label to numeric
        sentiment_label = pipe(review)[0]['label']
        return star_to_numeric.get(sentiment_label, None)  # None or some default for unknown labels
    except Exception as e:
        print(f"Error processing review: {e}")
        return None  # Or handle the error as required

# Apply the function to the reviews
result['Sentiment Rating'] = result['Review'].progress_apply(safe_convert)

  0%|          | 508/117630 [00:38<2:16:29, 14.30it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (530 > 512). Running this sequence through the model will result in indexing errors
  5%|▌         | 6313/117630 [08:09<2:17:19, 13.51it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (514 > 512). Running this sequence through the model will result in indexing errors


Error processing review: The size of tensor a (514) must match the size of tensor b (512) at non-singleton dimension 1


  9%|▉         | 10776/117630 [13:20<2:08:23, 13.87it/s]

Error processing review: The size of tensor a (513) must match the size of tensor b (512) at non-singleton dimension 1


 22%|██▏       | 26142/117630 [31:33<2:10:54, 11.65it/s]

Error processing review: The size of tensor a (514) must match the size of tensor b (512) at non-singleton dimension 1


 23%|██▎       | 26701/117630 [32:13<1:43:38, 14.62it/s]

Error processing review: The size of tensor a (514) must match the size of tensor b (512) at non-singleton dimension 1


 30%|██▉       | 34902/117630 [41:59<1:37:38, 14.12it/s]

Error processing review: The size of tensor a (513) must match the size of tensor b (512) at non-singleton dimension 1


 33%|███▎      | 38735/117630 [46:52<1:48:50, 12.08it/s]

Error processing review: The size of tensor a (514) must match the size of tensor b (512) at non-singleton dimension 1


 50%|█████     | 59338/117630 [1:12:03<1:01:16, 15.86it/s]

Error processing review: The size of tensor a (513) must match the size of tensor b (512) at non-singleton dimension 1


 56%|█████▌    | 65593/117630 [1:20:15<1:28:48,  9.77it/s]

Error processing review: The size of tensor a (514) must match the size of tensor b (512) at non-singleton dimension 1


 67%|██████▋   | 78605/117630 [1:35:57<47:04, 13.82it/s]  

Error processing review: The size of tensor a (513) must match the size of tensor b (512) at non-singleton dimension 1


 83%|████████▎ | 97430/117630 [1:58:26<21:28, 15.68it/s]  

Error processing review: The size of tensor a (513) must match the size of tensor b (512) at non-singleton dimension 1


 86%|████████▌ | 101054/117630 [2:02:47<23:07, 11.95it/s]

Error processing review: The size of tensor a (514) must match the size of tensor b (512) at non-singleton dimension 1


 87%|████████▋ | 102423/117630 [2:04:28<15:23, 16.47it/s]

Error processing review: The size of tensor a (513) must match the size of tensor b (512) at non-singleton dimension 1


100%|██████████| 117630/117630 [2:22:04<00:00, 13.80it/s]


In [7]:
result.to_csv("final_data_2.csv")

In [10]:
# Rename rating as review_rating
result = result.rename(columns={'Rating': 'Review_Rating'})
result = result[['Year', 'Genmodel_ID', 'Review_Rating', 'Sentiment Rating']]

In [12]:
result.to_csv("processed_review_data.csv", index=False)