# Data preprocessing

In [1]:
!pip install transformers[torch]
!pip install gensim
!pip install bertopic

Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->transformers[torch])
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->transformers[torch])
  Using cached nvidia_cublas_cu

In [2]:
import pandas as pd
import numpy as np

from google.colab import data_table, drive

data_table.enable_dataframe_formatter()
drive.mount('/content/drive/')
data_path = "/content/drive/MyDrive/Colab Notebooks/DSC 253 - Adv Data-Driven Text Mining/Challenge/"


Mounted at /content/drive/


In [3]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
# Load the training data
def load_data(path):
  train_df = pd.read_csv(path + 'train.csv')
  test_df = pd.read_csv(path + 'test.csv')
  test_df['label'] = np.nan
  combined_df = pd.concat([train_df, test_df], ignore_index=True)
  print(train_df.shape, test_df.shape, combined_df.shape)
  train_index = np.where(pd.notna(combined_df['label']))[0]
  test_index = np.where(pd.isna(combined_df['label']))[0]
  return combined_df, train_index, test_index

data, train_index, test_index = load_data(data_path)
data.head()


(13144, 62) (10000, 62) (23144, 62)


Unnamed: 0,id,attributes.HappyHour,attributes.Ambience,hours.Tuesday,postal_code,attributes.AgesAllowed,attributes.GoodForDancing,attributes.OutdoorSeating,hours.Saturday,attributes.Corkage,...,attributes.AcceptsInsurance,attributes.RestaurantsDelivery,attributes.DietaryRestrictions,attributes.BusinessAcceptsBitcoin,address,attributes.GoodForKids,attributes.GoodForMeal,hours,label,review
0,0,b'True',"b""{'romantic': False, 'intimate': False, 'clas...",b'15:0-2:0',b'44107',,,b'False',b'11:30-2:0',,...,,b'False',,,b'17800 Detroit Ave',b'False',"b""{'dessert': False, 'latenight': False, 'lunc...","{'Monday': '16:0-2:0', 'Tuesday': '15:0-2:0', ...",american (traditional),"So, we stopped here on our way to the Side Que..."
1,1,,"b""{'romantic': False, 'intimate': False, 'tour...",b'11:0-21:0',b'85042',,,b'True',b'11:0-20:30',,...,,b'False',,b'False',"b'2160 E Baseline Rd, Ste 128'",b'True',"b""{'dessert': False, 'latenight': False, 'lunc...","{'Monday': '11:0-21:0', 'Tuesday': '11:0-21:0'...",american (new),This is our go-to healthy spot! The food is al...
2,2,,,b'11:0-21:0',b'M4M 3G6',,,,b'11:0-21:0',,...,,,,,b'1000 Gerrard St E',,,"{'Monday': '11:0-21:0', 'Tuesday': '11:0-21:0'...",mexican,Food court meal at Gerrard Square. It's been ...
3,3,,"b""{'romantic': False, 'intimate': False, 'clas...",,b'89146',,,b'False',,,...,,b'True',,,b'6700 W Charleston Blvd',b'True',,,mexican,"Located on Rainbow/Charleston, this small fami..."
4,4,,"b""{'romantic': False, 'intimate': False, 'tour...",,b'44133',,,b'False',,,...,,b'False',,,b'5630 Wallings Rd',b'True',"b""{'dessert': False, 'latenight': False, 'lunc...",,chinese,No frills Chinese takeout joint which serves u...


In [5]:
# Calculate the percentage of missing values for each column
missing_data = data.isnull().sum() / len(data) * 100

# Filter out columns with more than 50% missing data
high_missing_data_columns = missing_data[missing_data > 50].index.tolist()

# Remove columns with more than 90% missing values
columns_to_remove = missing_data[missing_data > 50].index
train_data_cleaned = data.drop(columns=columns_to_remove)

# Overview of the data after removal
# train_data_cleaned.info(), train_data_cleaned.head()
print(high_missing_data_columns)
train_data_cleaned.columns


['attributes.HappyHour', 'attributes.AgesAllowed', 'attributes.GoodForDancing', 'attributes.Corkage', 'attributes.RestaurantsTableService', 'attributes.ByAppointmentOnly', 'attributes.Music', 'attributes.BYOB', 'attributes.BYOBCorkage', 'attributes.Open24Hours', 'attributes.DogsAllowed', 'attributes.HairSpecializesIn', 'attributes.CoatCheck', 'attributes.WheelchairAccessible', 'attributes.DriveThru', 'attributes.Smoking', 'attributes.RestaurantsCounterService', 'attributes.BestNights', 'attributes.AcceptsInsurance', 'attributes.DietaryRestrictions', 'attributes.BusinessAcceptsBitcoin']


Index(['id', 'attributes.Ambience', 'hours.Tuesday', 'postal_code',
       'attributes.OutdoorSeating', 'hours.Saturday', 'longitude', 'name',
       'attributes.BusinessAcceptsCreditCards',
       'attributes.RestaurantsReservations', 'hours.Friday',
       'attributes.RestaurantsPriceRange2', 'attributes.WiFi',
       'attributes.NoiseLevel', 'state', 'attributes.Alcohol',
       'attributes.HasTV', 'attributes', 'hours.Wednesday', 'hours.Sunday',
       'attributes.RestaurantsGoodForGroups', 'attributes.BusinessParking',
       'review_count', 'is_open', 'attributes.Caters',
       'attributes.BikeParking', 'hours.Monday', 'city', 'stars',
       'attributes.RestaurantsTakeOut', 'latitude', 'business_id',
       'hours.Thursday', 'attributes.RestaurantsAttire',
       'attributes.RestaurantsDelivery', 'address', 'attributes.GoodForKids',
       'attributes.GoodForMeal', 'hours', 'label', 'review'],
      dtype='object')

In [6]:
train_data_cleaned[["hours","hours.Tuesday","hours.Sunday"]].head()

Unnamed: 0,hours,hours.Tuesday,hours.Sunday
0,"{'Monday': '16:0-2:0', 'Tuesday': '15:0-2:0', ...",b'15:0-2:0',
1,"{'Monday': '11:0-21:0', 'Tuesday': '11:0-21:0'...",b'11:0-21:0',b'11:0-20:30'
2,"{'Monday': '11:0-21:0', 'Tuesday': '11:0-21:0'...",b'11:0-21:0',b'11:0-18:0'
3,,,
4,,,


In [7]:
pd.DataFrame(missing_data)

Unnamed: 0,0
id,0.000000
attributes.HappyHour,90.710335
attributes.Ambience,13.472174
hours.Tuesday,20.826132
postal_code,0.000000
...,...
attributes.GoodForKids,6.865710
attributes.GoodForMeal,40.468372
hours,18.263913
label,43.207743


In [8]:
print(train_data_cleaned.columns)
train_data_cleaned = train_data_cleaned.drop(columns=["business_id","address"])
train_data_cleaned.columns

Index(['id', 'attributes.Ambience', 'hours.Tuesday', 'postal_code',
       'attributes.OutdoorSeating', 'hours.Saturday', 'longitude', 'name',
       'attributes.BusinessAcceptsCreditCards',
       'attributes.RestaurantsReservations', 'hours.Friday',
       'attributes.RestaurantsPriceRange2', 'attributes.WiFi',
       'attributes.NoiseLevel', 'state', 'attributes.Alcohol',
       'attributes.HasTV', 'attributes', 'hours.Wednesday', 'hours.Sunday',
       'attributes.RestaurantsGoodForGroups', 'attributes.BusinessParking',
       'review_count', 'is_open', 'attributes.Caters',
       'attributes.BikeParking', 'hours.Monday', 'city', 'stars',
       'attributes.RestaurantsTakeOut', 'latitude', 'business_id',
       'hours.Thursday', 'attributes.RestaurantsAttire',
       'attributes.RestaurantsDelivery', 'address', 'attributes.GoodForKids',
       'attributes.GoodForMeal', 'hours', 'label', 'review'],
      dtype='object')


Index(['id', 'attributes.Ambience', 'hours.Tuesday', 'postal_code',
       'attributes.OutdoorSeating', 'hours.Saturday', 'longitude', 'name',
       'attributes.BusinessAcceptsCreditCards',
       'attributes.RestaurantsReservations', 'hours.Friday',
       'attributes.RestaurantsPriceRange2', 'attributes.WiFi',
       'attributes.NoiseLevel', 'state', 'attributes.Alcohol',
       'attributes.HasTV', 'attributes', 'hours.Wednesday', 'hours.Sunday',
       'attributes.RestaurantsGoodForGroups', 'attributes.BusinessParking',
       'review_count', 'is_open', 'attributes.Caters',
       'attributes.BikeParking', 'hours.Monday', 'city', 'stars',
       'attributes.RestaurantsTakeOut', 'latitude', 'hours.Thursday',
       'attributes.RestaurantsAttire', 'attributes.RestaurantsDelivery',
       'attributes.GoodForKids', 'attributes.GoodForMeal', 'hours', 'label',
       'review'],
      dtype='object')

In [9]:
json_columns = [
    'attributes.Ambience',
    'attributes',
    'attributes.BusinessParking',
    'attributes.GoodForMeal',
    'hours'
]
bool_columns = [
    'attributes.OutdoorSeating',
    'attributes.BusinessAcceptsCreditCards',
    'attributes.RestaurantsReservations',
    'attributes.RestaurantsGoodForGroups',
    'attributes.Caters',
    'attributes.BikeParking',
    'attributes.RestaurantsDelivery',
    'attributes.GoodForKids',
    'is_open',
    'attributes.HasTV',
    'attributes.RestaurantsTakeOut'
]
categorical_columns = [
    'postal_code',
    # 'name',
    'attributes.RestaurantsPriceRange2',
    'attributes.WiFi',
    'attributes.NoiseLevel',
    'state',
    'attributes.Alcohol',
    'attributes.RestaurantsAttire',
    'city'
    # 'label'
]
numeric_columns = [
    'longitude',
    'latitude',
    'stars',
    'review_count'
]
text_columns = ['review','name']

In [10]:
import ast
import json
from tqdm import tqdm

# Function to decode and clean byte strings and convert to appropriate types
def clean_and_convert(value):
    if isinstance(value, str):
        # Remove b'', b"", u'', u"" prefixes and quotes
        if (value.startswith("b'") and value.endswith("'")) or (value.startswith('b"') and value.endswith('"')):
            value = value[2:-1]
        if (value.startswith("u'") and value.endswith("'")) or (value.startswith('u"') and value.endswith('"')):
            value = value[2:-1]
        if (value.startswith("'") and value.endswith("'")) or (value.startswith('"') and value.endswith('"')):
            value = value[1:-1]

        # Handle remaining byte strings
        try:
            eval_value = ast.literal_eval(value)
            if isinstance(eval_value, bytes):
                return eval_value.decode('utf-8')
        except (ValueError, SyntaxError):
            pass

        return value.strip()
    return value

# Apply the clean_and_convert function across the entire dataframe
train_data_cleaned_2 = train_data_cleaned.applymap(clean_and_convert)

# Function to convert JSON-like strings to dictionaries
def parse_json_string(value):
    if isinstance(value, str) and (value.startswith("{") or value.startswith("[")):
        try:
            return json.loads(value.replace("'", '"').replace('True', 'true').replace('False', 'false'))  # Convert single quotes to double for valid JSON
        except json.JSONDecodeError:
            return value
    return value

# Function to convert string representations of boolean values to actual bool
def convert_to_bool(value):
    if isinstance(value, str):
        if value.lower() == 'true':
            return True
        if value.lower() == 'false':
            return False
    return value

# Convert JSON columns
for col in tqdm(json_columns):
    if col in train_data_cleaned_2.columns:
        train_data_cleaned_2[col] = train_data_cleaned_2[col].apply(parse_json_string)

# Convert Bool columns
for col in tqdm(bool_columns):
    if col in train_data_cleaned_2.columns:
        train_data_cleaned_2[col] = train_data_cleaned_2[col].apply(convert_to_bool)

train_data_cleaned_2.replace('None', float('nan'), inplace=True)

# Display the cleaned dataframe
train_data_cleaned_2.head()

100%|██████████| 5/5 [00:00<00:00, 10.14it/s]
100%|██████████| 11/11 [00:00<00:00, 160.70it/s]




Unnamed: 0,id,attributes.Ambience,hours.Tuesday,postal_code,attributes.OutdoorSeating,hours.Saturday,longitude,name,attributes.BusinessAcceptsCreditCards,attributes.RestaurantsReservations,...,attributes.RestaurantsTakeOut,latitude,hours.Thursday,attributes.RestaurantsAttire,attributes.RestaurantsDelivery,attributes.GoodForKids,attributes.GoodForMeal,hours,label,review
0,0,"{'romantic': False, 'intimate': False, 'classy...",15:0-2:0,44107,False,11:30-2:0,-81.820568,Rush Inn,True,False,...,True,41.484197,11:30-2:0,casual,False,False,"{'dessert': False, 'latenight': False, 'lunch'...","{'Monday': '16:0-2:0', 'Tuesday': '15:0-2:0', ...",american (traditional),"So, we stopped here on our way to the Side Que..."
1,1,"{'romantic': False, 'intimate': False, 'touris...",11:0-21:0,85042,True,11:0-20:30,-112.032893,GreenMix,True,False,...,True,33.379283,11:0-21:0,casual,False,True,"{'dessert': False, 'latenight': False, 'lunch'...","{'Monday': '11:0-21:0', 'Tuesday': '11:0-21:0'...",american (new),This is our go-to healthy spot! The food is al...
2,2,,11:0-21:0,M4M 3G6,,11:0-21:0,-79.339163,BarBurrito - Gerrard,,False,...,False,43.669144,11:0-21:0,,,,,"{'Monday': '11:0-21:0', 'Tuesday': '11:0-21:0'...",mexican,Food court meal at Gerrard Square. It's been ...
3,3,"{'romantic': False, 'intimate': False, 'classy...",,89146,False,,-115.242714,SalvaMex,True,False,...,True,36.159527,,casual,True,True,,,mexican,"Located on Rainbow/Charleston, this small fami..."
4,4,"{'romantic': False, 'intimate': False, 'touris...",,44133,False,,-81.726357,Hop Hing,True,False,...,True,41.330546,,casual,False,True,"{'dessert': False, 'latenight': False, 'lunch'...",,chinese,No frills Chinese takeout joint which serves u...


In [11]:
train_data_cleaned["hours.Saturday"][3]
float('nan')

nan

In [12]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

data_encoded = train_data_cleaned_2.copy()
# Factorize the label if not already done
if 'label_encoded' not in data_encoded.columns:
    label_encoder = LabelEncoder()
    data_encoded.loc[train_index, 'label_encoded'] = label_encoder.fit_transform(data_encoded.loc[train_index, 'label'])

# Ensure 'label' is included for chi-square test
categorical_columns_with_label = categorical_columns + ['label']

# Convert categorical features to category dtype if they are not already
for col in categorical_columns_with_label:
    data_encoded[col] = data_encoded[col].astype('category')

train_data_encoded = data_encoded.loc[train_index]
data_encoded.head()



Unnamed: 0,id,attributes.Ambience,hours.Tuesday,postal_code,attributes.OutdoorSeating,hours.Saturday,longitude,name,attributes.BusinessAcceptsCreditCards,attributes.RestaurantsReservations,...,latitude,hours.Thursday,attributes.RestaurantsAttire,attributes.RestaurantsDelivery,attributes.GoodForKids,attributes.GoodForMeal,hours,label,review,label_encoded
0,0,"{'romantic': False, 'intimate': False, 'classy...",15:0-2:0,44107,False,11:30-2:0,-81.820568,Rush Inn,True,False,...,41.484197,11:30-2:0,casual,False,False,"{'dessert': False, 'latenight': False, 'lunch'...","{'Monday': '16:0-2:0', 'Tuesday': '15:0-2:0', ...",american (traditional),"So, we stopped here on our way to the Side Que...",1.0
1,1,"{'romantic': False, 'intimate': False, 'touris...",11:0-21:0,85042,True,11:0-20:30,-112.032893,GreenMix,True,False,...,33.379283,11:0-21:0,casual,False,True,"{'dessert': False, 'latenight': False, 'lunch'...","{'Monday': '11:0-21:0', 'Tuesday': '11:0-21:0'...",american (new),This is our go-to healthy spot! The food is al...,0.0
2,2,,11:0-21:0,M4M 3G6,,11:0-21:0,-79.339163,BarBurrito - Gerrard,,False,...,43.669144,11:0-21:0,,,,,"{'Monday': '11:0-21:0', 'Tuesday': '11:0-21:0'...",mexican,Food court meal at Gerrard Square. It's been ...,8.0
3,3,"{'romantic': False, 'intimate': False, 'classy...",,89146,False,,-115.242714,SalvaMex,True,False,...,36.159527,,casual,True,True,,,mexican,"Located on Rainbow/Charleston, this small fami...",8.0
4,4,"{'romantic': False, 'intimate': False, 'touris...",,44133,False,,-81.726357,Hop Hing,True,False,...,41.330546,,casual,False,True,"{'dessert': False, 'latenight': False, 'lunch'...",,chinese,No frills Chinese takeout joint which serves u...,4.0


### Numeric Conclusion
Based on these results, **latitude** and **longitude** are the most useful numeric features for predicting the label. **stars** might still be useful but to a lesser extent, and **review_count** appears to be the least useful feature. These insights can guide you in selecting and prioritizing features for your predictive models.

In [None]:
# Correlation Analysis
correlations = train_data_encoded[numeric_columns].corrwith(train_data_encoded['label_encoded'])
print("Correlation with target label:")
print(correlations)

from scipy.stats import chi2_contingency

def chi_square_test(df, feature, target):
    contingency_table = pd.crosstab(df[feature], df[target])
    _, p_value, _, _ = chi2_contingency(contingency_table)
    return p_value

# Discretize numeric features for chi-square test
for col in numeric_columns:
    train_data_encoded[col+'_binned'] = pd.qcut(train_data_encoded[col], q=10, duplicates='drop')

chi_square_results = {col: chi_square_test(train_data_encoded, col+'_binned', 'label_encoded') for col in numeric_columns}
print("Chi-square test p-values:")
print(chi_square_results)

from sklearn.ensemble import RandomForestClassifier

# Prepare the data
X = train_data_encoded[numeric_columns]
y = train_data_encoded['label_encoded']

# Train a lightweight RandomForest model
model = RandomForestClassifier(n_estimators=10, random_state=42)
model.fit(X, y)

# Get feature importances
importances = model.feature_importances_
feature_importance = pd.Series(importances, index=X.columns)
print("Feature Importance from RandomForest:")
print(feature_importance.sort_values(ascending=False))

from sklearn.feature_selection import mutual_info_classif

# Calculate mutual information
mutual_info = mutual_info_classif(X, y)
mutual_info_series = pd.Series(mutual_info, index=X.columns)
print("Mutual Information with target label:")
print(mutual_info_series.sort_values(ascending=False))


Correlation with target label:
longitude       0.167513
latitude        0.296189
stars           0.107084
review_count   -0.038622
dtype: float64
Chi-square test p-values:
{'longitude': 0.0, 'latitude': 0.0, 'stars': 4.7661634178532754e-123, 'review_count': 7.219493165138911e-58}
Feature Importance from RandomForest:
latitude        0.340751
longitude       0.327411
review_count    0.254863
stars           0.076975
dtype: float64
Mutual Information with target label:
latitude        0.242165
longitude       0.221670
review_count    0.022676
stars           0.021680
dtype: float64


### Categorical Actionable Insights

- **Focus on Significant Features**: Based on chi-square, RandomForest, and mutual information results, focus on the most significant features (e.g., state, city, attire, price range, noise level).
- **Exclude Non-informative Features**: Features with zero mutual information or low importance scores can be excluded to simplify the model.
- **Use Imputation**: Ensure proper handling of NaNs to maintain data integrity and avoid biases in model training.

We consider these are significant
```
    'attributes.RestaurantsPriceRange2',
    'attributes.WiFi',
    'attributes.NoiseLevel',
    'state',
    'attributes.Alcohol',
    'attributes.RestaurantsAttire', >
    'city'
```

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer

verify_categorical_columns = categorical_columns.copy()
verify_categorical_columns.remove('postal_code')

imputer = SimpleImputer(strategy='constant', fill_value='missing')
train_data_encoded[verify_categorical_columns] = imputer.fit_transform(train_data_encoded[verify_categorical_columns])

# Chi-Square Test
chi_square_results = {col: chi2_contingency(pd.crosstab(train_data_encoded[col], train_data_encoded['label_encoded']))[1] for col in verify_categorical_columns}
print("Chi-square test p-values:")
print(chi_square_results)

# Feature Importance from RandomForest
# Prepare the data
X_cat = train_data_encoded[verify_categorical_columns]
y_cat = train_data_encoded['label_encoded']

# One-hot encode categorical variables
onehot_encoder = OneHotEncoder(drop='first', sparse=False)
X_encoded = onehot_encoder.fit_transform(X_cat)

# Train a lightweight RandomForest model
model = RandomForestClassifier(n_estimators=10, random_state=42)
model.fit(X_encoded, y_cat)

# Get feature importances
importances = model.feature_importances_
feature_names = onehot_encoder.get_feature_names_out(verify_categorical_columns)
feature_importance = pd.Series(importances, index=feature_names)
print("Feature Importance from RandomForest:")
print(feature_importance.sort_values(ascending=False))

# Mutual Information
# Calculate mutual information
mutual_info = mutual_info_classif(X_encoded, y_cat)
mutual_info_series = pd.Series(mutual_info, index=feature_names)
print("Mutual Information with target label:")
print(mutual_info_series.sort_values(ascending=False))

Chi-square test p-values:
{'attributes.RestaurantsPriceRange2': 0.0, 'attributes.WiFi': 9.809474607064422e-168, 'attributes.NoiseLevel': 3.0513060833241326e-127, 'state': 0.0, 'attributes.Alcohol': 0.0, 'attributes.RestaurantsAttire': 1.8768405102084909e-65, 'city': 0.0}
Feature Importance from RandomForest:
attributes.RestaurantsPriceRange2_2    0.063472
attributes.NoiseLevel_average          0.036363
attributes.NoiseLevel_quiet            0.034666
attributes.Alcohol_full_bar            0.034588
attributes.WiFi_no                     0.031081
                                         ...   
city_Ambridge                          0.000031
city_Saint-L\xc3\xa9onard              0.000025
city_Mt. Holly                         0.000021
city_Belleville                        0.000015
city_Munhall                           0.000015
Length: 456, dtype: float64




Mutual Information with target label:
state_ON                       0.075390
attributes.Alcohol_full_bar    0.059457
state_AZ                       0.035354
city_Toronto                   0.033850
state_QC                       0.028749
                                 ...   
city_Munhall                   0.000000
city_Munroe Falls              0.000000
city_Murrysville               0.000000
city_NELLIS AFB                0.000000
city_springdale                0.000000
Length: 456, dtype: float64


In [None]:
unique_values_counts = {col: data_encoded[col].nunique() for col in categorical_columns}
unique_values_counts

{'postal_code': 5006,
 'attributes.RestaurantsPriceRange2': 5,
 'attributes.WiFi': 4,
 'attributes.NoiseLevel': 5,
 'state': 14,
 'attributes.Alcohol': 4,
 'attributes.RestaurantsAttire': 4,
 'city': 511}

In [None]:
print(feature_importance.sort_values(ascending=False)[:50])
print(feature_importance.sort_values(ascending=False)[-20:])

attributes.RestaurantsAttire_u'casual'       0.040051
attributes.RestaurantsPriceRange2_2          0.029663
attributes.NoiseLevel_u'average'             0.027957
attributes.WiFi_u'no'                        0.026195
attributes.WiFi_u'free'                      0.022060
attributes.WiFi_'no'                         0.021381
attributes.Alcohol_u'full_bar'               0.020342
attributes.NoiseLevel_u'quiet'               0.019132
attributes.WiFi_missing                      0.018021
attributes.NoiseLevel_missing                0.015508
attributes.Alcohol_u'none'                   0.015292
attributes.Alcohol_'none'                    0.012281
attributes.Alcohol_u'beer_and_wine'          0.012134
attributes.Alcohol_'full_bar'                0.011525
attributes.Alcohol_missing                   0.011436
state_ON                                     0.011116
attributes.RestaurantsAttire_missing         0.009559
attributes.NoiseLevel_u'loud'                0.008665
attributes.RestaurantsPriceR

In [None]:
print(mutual_info_series.sort_values(ascending=False)[:50])
print(mutual_info_series.sort_values(ascending=False)[-20:])

state_ON                                 0.075144
attributes.Alcohol_u'full_bar'           0.043597
city_Toronto                             0.035341
attributes.Alcohol_u'none'               0.031361
state_AZ                                 0.028128
state_QC                                 0.025474
city_Markham                             0.024472
postal_code_M6G 1K4                      0.021831
city_Phoenix                             0.021465
postal_code_L4B 3K4                      0.020409
attributes.NoiseLevel_u'quiet'           0.020284
attributes.Alcohol_u'beer_and_wine'      0.020046
city_Mentor On The Lake                  0.018825
postal_code_85207                        0.018553
postal_code_M6K 1X4                      0.018433
city_Aurora                              0.018311
attributes.RestaurantsAttire_'dressy'    0.018195
postal_code_L5G 2T2                      0.018070
postal_code_M5V 1J5                      0.018055
postal_code_H4C 2G3                      0.017512


In [None]:
verify_categorical_columns = categorical_columns.copy()
verify_categorical_columns.remove('postal_code')
verify_categorical_columns, categorical_columns

(['attributes.RestaurantsPriceRange2',
  'attributes.WiFi',
  'attributes.NoiseLevel',
  'state',
  'attributes.Alcohol',
  'attributes.RestaurantsAttire',
  'city'],
 ['postal_code',
  'attributes.RestaurantsPriceRange2',
  'attributes.WiFi',
  'attributes.NoiseLevel',
  'state',
  'attributes.Alcohol',
  'attributes.RestaurantsAttire',
  'city'])

In [None]:
pd.DataFrame(feature_importance)

Unnamed: 0,0
attributes.RestaurantsPriceRange2_2,0.063472
attributes.RestaurantsPriceRange2_3,0.023369
attributes.RestaurantsPriceRange2_4,0.005397
attributes.RestaurantsPriceRange2_None,0.000428
attributes.RestaurantsPriceRange2_missing,0.016079
...,...
city_Woodbridge,0.002921
city_Woodmere,0.000751
city_York,0.000577
city_Youngtown,0.000366


In [None]:
pd.DataFrame(mutual_info_series)

Unnamed: 0,0
attributes.RestaurantsPriceRange2_2,0.026157
attributes.RestaurantsPriceRange2_3,0.010155
attributes.RestaurantsPriceRange2_4,0.002663
attributes.RestaurantsPriceRange2_None,0.000000
attributes.RestaurantsPriceRange2_missing,0.006334
...,...
city_Woodbridge,0.000000
city_Woodmere,0.000000
city_York,0.002712
city_Youngtown,0.000000


### Bool Conclusion
All boolean features tested show high significance with chi-square p-values, suggesting they are statistically significant for predicting the target label. However, based on mutual information and RandomForest feature importance, features like `attributes.BusinessAcceptsCreditCards_True`, `attributes.RestaurantsDelivery_True`, `attributes.OutdoorSeating_True`, `attributes.RestaurantsReservations_True`, and `attributes.GoodForKids_True` are particularly significant and should be prioritized in your predictive modeling.

Features with lower mutual information scores, such as `attributes.BikeParking_True`, `attributes.Caters_True`, `attributes.RestaurantsGoodForGroups_True`, `attributes.RestaurantsTakeOut_True`, and `is_open_1`, although still significant, might be considered less critical.

In [None]:
# Calculate the count of unique values for each boolean column
unique_values_counts_bool = {col: train_data_encoded[col].nunique() for col in bool_columns}

# Print the count of unique values for each boolean column
print("Count of unique values in each boolean column:")
for col, count in unique_values_counts_bool.items():
    print(f"{col}: {count} unique values")

Count of unique values in each boolean column:
attributes.OutdoorSeating: 2 unique values
attributes.BusinessAcceptsCreditCards: 2 unique values
attributes.RestaurantsReservations: 2 unique values
attributes.RestaurantsGoodForGroups: 2 unique values
attributes.Caters: 2 unique values
attributes.BikeParking: 2 unique values
attributes.RestaurantsDelivery: 2 unique values
attributes.GoodForKids: 2 unique values
is_open: 2 unique values
attributes.HasTV: 2 unique values
attributes.RestaurantsTakeOut: 2 unique values


In [None]:
train_data_encoded["attributes.OutdoorSeating"].unique()

array([False, True, nan], dtype=object)

In [None]:
# Impute NaNs with a placeholder (False)
imputer_bool = SimpleImputer(strategy='constant', fill_value=False)
train_data_encoded[bool_columns] = imputer_bool.fit_transform(train_data_encoded[bool_columns])

# Chi-Square Test for Boolean Columns
chi_square_results_bool = {col: chi2_contingency(pd.crosstab(train_data_encoded[col], train_data_encoded['label_encoded']))[1] for col in bool_columns}
print("Chi-square test p-values for boolean columns:")
print(chi_square_results_bool)

# Feature Importance from RandomForest for Boolean Columns
# Prepare the data
X_bool = train_data_encoded[bool_columns]
y_bool = train_data_encoded['label_encoded']

# One-hot encode boolean variables (if necessary)
onehot_encoder_bool = OneHotEncoder(drop='first', sparse=False, handle_unknown='ignore')
X_bool_encoded = onehot_encoder_bool.fit_transform(X_bool)

# Train a lightweight RandomForest model
model_bool = RandomForestClassifier(n_estimators=10, random_state=42)
model_bool.fit(X_bool_encoded, y_bool)

# Get feature importances
importances_bool = model_bool.feature_importances_
feature_names_bool = onehot_encoder_bool.get_feature_names_out(bool_columns)
feature_importance_bool = pd.Series(importances_bool, index=feature_names_bool)
print("Feature Importance from RandomForest for boolean columns:")
print(feature_importance_bool.sort_values(ascending=False))

# Mutual Information for Boolean Columns
# Calculate mutual information
mutual_info_bool = mutual_info_classif(X_bool_encoded, y_bool)
mutual_info_series_bool = pd.Series(mutual_info_bool, index=feature_names_bool)
print("Mutual Information with target label for boolean columns:")
print(mutual_info_series_bool.sort_values(ascending=False))

Chi-square test p-values for boolean columns:
{'attributes.OutdoorSeating': 8.784720429369697e-281, 'attributes.BusinessAcceptsCreditCards': 0.0, 'attributes.RestaurantsReservations': 4.546218140048973e-203, 'attributes.RestaurantsGoodForGroups': 3.975369606035848e-21, 'attributes.Caters': 2.7339117841727853e-84, 'attributes.BikeParking': 2.6106532775479318e-19, 'attributes.RestaurantsDelivery': 4.833173156595177e-283, 'attributes.GoodForKids': 2.2008273398327175e-144, 'is_open': 3.927700966843965e-13, 'attributes.HasTV': 8.330256894683762e-132, 'attributes.RestaurantsTakeOut': 5.193540162326398e-67}
Feature Importance from RandomForest for boolean columns:
attributes.BusinessAcceptsCreditCards_True    0.129642
attributes.RestaurantsReservations_True       0.100268
attributes.RestaurantsDelivery_True           0.099313
attributes.OutdoorSeating_True                0.095228
is_open_1                                     0.093104
attributes.GoodForKids_True                   0.092975
attr

# Bert finetune

In [None]:
data_encoded.head()



Unnamed: 0,id,attributes.Ambience,hours.Tuesday,postal_code,attributes.OutdoorSeating,hours.Saturday,longitude,name,attributes.BusinessAcceptsCreditCards,attributes.RestaurantsReservations,...,latitude,hours.Thursday,attributes.RestaurantsAttire,attributes.RestaurantsDelivery,attributes.GoodForKids,attributes.GoodForMeal,hours,label,review,label_encoded
0,0,"{'romantic': False, 'intimate': False, 'classy...",15:0-2:0,44107,False,11:30-2:0,-81.820568,Rush Inn,True,False,...,41.484197,11:30-2:0,casual,False,False,"{'dessert': False, 'latenight': False, 'lunch'...","{'Monday': '16:0-2:0', 'Tuesday': '15:0-2:0', ...",american (traditional),"So, we stopped here on our way to the Side Que...",1.0
1,1,"{'romantic': False, 'intimate': False, 'touris...",11:0-21:0,85042,True,11:0-20:30,-112.032893,GreenMix,True,False,...,33.379283,11:0-21:0,casual,False,True,"{'dessert': False, 'latenight': False, 'lunch'...","{'Monday': '11:0-21:0', 'Tuesday': '11:0-21:0'...",american (new),This is our go-to healthy spot! The food is al...,0.0
2,2,,11:0-21:0,M4M 3G6,,11:0-21:0,-79.339163,BarBurrito - Gerrard,,False,...,43.669144,11:0-21:0,,,,,"{'Monday': '11:0-21:0', 'Tuesday': '11:0-21:0'...",mexican,Food court meal at Gerrard Square. It's been ...,8.0
3,3,"{'romantic': False, 'intimate': False, 'classy...",,89146,False,,-115.242714,SalvaMex,True,False,...,36.159527,,casual,True,True,,,mexican,"Located on Rainbow/Charleston, this small fami...",8.0
4,4,"{'romantic': False, 'intimate': False, 'touris...",,44133,False,,-81.726357,Hop Hing,True,False,...,41.330546,,casual,False,True,"{'dessert': False, 'latenight': False, 'lunch'...",,chinese,No frills Chinese takeout joint which serves u...,4.0


In [None]:
from sklearn.model_selection import train_test_split

# Split train_index into training and validation sets
train_idx, val_idx = train_test_split(train_index, test_size=0.2, random_state=42)

# Create DataFrame for training and validation sets
train_data = data_encoded.loc[train_idx]
val_data = data_encoded.loc[val_idx]

# Extract reviews and labels
reviews = data_encoded['review']
labels = data_encoded['label_encoded']
labels.unique()

array([ 1.,  0.,  8.,  4.,  7.,  5.,  9.,  2.,  3.,  6., nan])

In [None]:
! rm -rf bert-finetuned/

In [None]:
!lscpu
!nvidia-smi

Architecture:             x86_64
  CPU op-mode(s):         32-bit, 64-bit
  Address sizes:          46 bits physical, 48 bits virtual
  Byte Order:             Little Endian
CPU(s):                   16
  On-line CPU(s) list:    0-15
Vendor ID:                GenuineIntel
  Model name:             Intel(R) Xeon(R) CPU @ 2.20GHz
    CPU family:           6
    Model:                85
    Thread(s) per core:   2
    Core(s) per socket:   8
    Socket(s):            1
    Stepping:             7
    BogoMIPS:             4400.46
    Flags:                fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 cl
                          flush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc re
                          p_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3
                           fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand
                           hypervisor lahf_lm abm 3dnowprefetch

In [None]:
import os
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.metrics import f1_score
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

num_labels = len(labels.unique())-1

# Define a custom dataset
class ReviewDataset(Dataset):
    def __init__(self, reviews, labels, tokenizer, max_len):
        self.reviews = reviews
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, item):
        review = str(self.reviews[item])
        label = self.labels[item]

        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'review_text': review,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Check if a saved model exists and decide whether to load it or train a new one
model_path = './bert-finetuned'
continue_epoch = 0
saved_model_path = os.path.join(data_path, 'bert-finetuned', f'checkpoint-epoch-{continue_epoch}')
use_saved_model = os.path.exists(saved_model_path)
if use_saved_model:
    print("Loaded saved model from", saved_model_path)
    model = BertForSequenceClassification.from_pretrained(saved_model_path)
    tokenizer = BertTokenizer.from_pretrained(saved_model_path)
    print(f"Continuing training from epoch {continue_epoch}")
else:
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)
    print("Training a new model")

# if use_saved_model:
#     model = BertForSequenceClassification.from_pretrained(model_path)
#     print("Loaded saved model from", model_path)
#     # Check if there's a specific epoch checkpoint to load
#     for epoch in range(5, 100, 5):  # Assuming max epochs 100
#         checkpoint_path = os.path.join(model_path, f'checkpoint-epoch-{epoch}')
#         if os.path.exists(checkpoint_path):
#             # model = BertForSequenceClassification.from_pretrained(checkpoint_path)
#             # tokenizer = BertTokenizer.from_pretrained(checkpoint_path)
#             continue_epoch = epoch
#             # print(f"Continuing training from epoch {epoch}")
#     if continue_epoch != 0:
#         checkpoint_path = os.path.join(model_path, f'checkpoint-epoch-{continue_epoch}')
#         model = BertForSequenceClassification.from_pretrained(checkpoint_path)
#         tokenizer = BertTokenizer.from_pretrained(checkpoint_path)
#         print(f"Continuing training from epoch {continue_epoch}")
# else:
#     model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)
#     print("Training a new model")

# Create DataLoader for training and validation sets
train_dataset = ReviewDataset(
    reviews=train_data['review'].tolist(),
    labels=train_data['label_encoded'].tolist(),
    tokenizer=tokenizer,
    max_len=128
)

val_dataset = ReviewDataset(
    reviews=val_data['review'].tolist(),
    labels=val_data['label_encoded'].tolist(),
    tokenizer=tokenizer,
    max_len=128
)

# Define training arguments
training_args = TrainingArguments(
    output_dir=model_path,
    num_train_epochs=5,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    logging_dir='./logs',
    logging_steps=100,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    dataloader_num_workers=16,
    weight_decay=0.05,  # Add weight decay for regularization
)

# Define compute_metrics function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average='micro')
    return {
        'f1 micro': f1,
    }

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# Train the model if we are not using a saved model
if not use_saved_model or continue_epoch < training_args.num_train_epochs:
    if continue_epoch > 0:
        trainer.train(resume_from_checkpoint=os.path.join(model_path, f'checkpoint-{continue_epoch*165}'))
    else:
        trainer.train()

    if False:
        checkpoint_dir = os.path.join(data_path, 'bert-finetuned', f'checkpoint-epoch-{int(trainer.state.epoch)}')
        trainer.save_model(checkpoint_dir)
        tokenizer.save_pretrained(checkpoint_dir)
        print(f"Saved model checkpoint to {checkpoint_dir}")

# Evaluate the model on the validation set
eval_results = trainer.evaluate(eval_dataset=val_dataset)

# Print the F1 score
print(f"Validation F1 Score: {eval_results['eval_f1 micro']:.4f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training a new model


  self.pid = os.fork()


Epoch,Training Loss,Validation Loss,F1 micro
1,No log,0.989896,0.686953
2,1.259200,0.878316,0.705211
3,0.785700,0.894297,0.709395
4,0.617500,0.907222,0.712438
5,0.507400,0.918971,0.710536


  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()


Validation F1 Score: 0.7052


In [None]:
checkpoint_dir = os.path.join(data_path, 'bert-finetuned', f'checkpoint-epoch-{int(trainer.state.epoch)}')
trainer.save_model(checkpoint_dir)
tokenizer.save_pretrained(checkpoint_dir)
print(f"Saved model checkpoint to {checkpoint_dir}")

Saved model checkpoint to /content/drive/MyDrive/Colab Notebooks/DSC 253 - Adv Data-Driven Text Mining/Challenge/bert-finetuned/checkpoint-epoch-5


# Hybrid model
## Convert to dataframe

In [13]:
used_json_columns = [
    'attributes.Ambience',
    # 'attributes',
    'attributes.BusinessParking',
    'attributes.GoodForMeal',
    'hours'
]
used_bool_columns = [
    'attributes.OutdoorSeating',
    'attributes.BusinessAcceptsCreditCards',
    'attributes.RestaurantsReservations',
    'attributes.RestaurantsGoodForGroups',
    'attributes.Caters',
    'attributes.BikeParking',
    'attributes.RestaurantsDelivery',
    'attributes.GoodForKids',
    'is_open',
    'attributes.HasTV',
    'attributes.RestaurantsTakeOut'
]
used_categorical_columns = [
    'attributes.RestaurantsPriceRange2',
    'attributes.WiFi',
    'attributes.NoiseLevel',
    'state',
    'attributes.Alcohol',
    'attributes.RestaurantsAttire',
    # 'city'
]
used_numeric_columns = [
    'longitude',
    'latitude',
    'stars',
    # 'review_count'
]
used_text_columns = ['review','name']

In [14]:
data_encoded[used_json_columns].head()

Unnamed: 0,attributes.Ambience,attributes.BusinessParking,attributes.GoodForMeal,hours
0,"{'romantic': False, 'intimate': False, 'classy...","{'garage': False, 'street': True, 'validated':...","{'dessert': False, 'latenight': False, 'lunch'...","{'Monday': '16:0-2:0', 'Tuesday': '15:0-2:0', ..."
1,"{'romantic': False, 'intimate': False, 'touris...","{'garage': False, 'street': False, 'validated'...","{'dessert': False, 'latenight': False, 'lunch'...","{'Monday': '11:0-21:0', 'Tuesday': '11:0-21:0'..."
2,,,,"{'Monday': '11:0-21:0', 'Tuesday': '11:0-21:0'..."
3,"{'romantic': False, 'intimate': False, 'classy...","{'garage': False, 'street': False, 'validated'...",,
4,"{'romantic': False, 'intimate': False, 'touris...","{'garage': False, 'street': False, 'validated'...","{'dessert': False, 'latenight': False, 'lunch'...",


In [15]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

# Split train_index into training and validation sets
train_idx, val_idx = train_test_split(train_index, test_size=0.2, random_state=42)

# Extract labels
labels = data_encoded['label_encoded']
labels.unique()

# Extract text columns
# data_encoded['merged_review'] = data_encoded.apply(
#     lambda row: f"{row['name']}, from {row['city']}, {row['state']}: {row['review']}", axis=1)
data_encoded['merged_review'] = data_encoded.apply(
    lambda row: f"{row['name']}: {row['review']}", axis=1)
reviews = data_encoded['merged_review']

# Extract boolean columns
imputer_bool = SimpleImputer(strategy='constant', fill_value=0.5)
data_encoded[used_bool_columns] = imputer_bool.fit_transform(data_encoded[used_bool_columns])
bool_data = data_encoded[used_bool_columns].astype(float)

# One-hot encode categorical columns
onehot_encoder = OneHotEncoder(sparse=False)
imputer_cat = SimpleImputer(strategy='constant', fill_value='missing')
data_encoded[used_categorical_columns] = imputer_cat.fit_transform(data_encoded[used_categorical_columns])
categorical_data = onehot_encoder.fit_transform(data_encoded[used_categorical_columns])

# Normalize numeric columns
scaler = StandardScaler()
numeric_data = scaler.fit_transform(data_encoded[used_numeric_columns])

# Combine all additional features
additional_features = pd.concat([
    bool_data,
    pd.DataFrame(categorical_data, columns=onehot_encoder.get_feature_names_out(used_categorical_columns)),
    pd.DataFrame(numeric_data, columns=used_numeric_columns)
], axis=1)
additional_features





Unnamed: 0,attributes.OutdoorSeating,attributes.BusinessAcceptsCreditCards,attributes.RestaurantsReservations,attributes.RestaurantsGoodForGroups,attributes.Caters,attributes.BikeParking,attributes.RestaurantsDelivery,attributes.GoodForKids,is_open,attributes.HasTV,...,attributes.Alcohol_full_bar,attributes.Alcohol_missing,attributes.Alcohol_none,attributes.RestaurantsAttire_casual,attributes.RestaurantsAttire_dressy,attributes.RestaurantsAttire_formal,attributes.RestaurantsAttire_missing,longitude,latitude,stars
0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.736761,0.432861,0.745554
1,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,-1.087209,-1.230437,0.042727
2,0.5,0.5,0.0,0.5,0.5,0.5,0.5,0.5,1.0,0.5,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.886568,0.881258,-0.660100
3,0.0,1.0,0.0,1.0,1.0,0.5,1.0,1.0,0.0,1.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,-1.280991,-0.659873,0.745554
4,0.0,1.0,0.0,0.0,0.5,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.742449,0.401329,0.042727
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23139,1.0,1.0,0.0,1.0,1.0,0.5,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.795606,-0.893170,0.745554
23140,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-1.080086,-1.227520,0.042727
23141,0.0,1.0,0.0,1.0,0.5,0.5,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,-1.272848,-0.677820,0.042727
23142,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,-1.109012,-1.214611,0.042727


In [16]:
additional_features.columns

Index(['attributes.OutdoorSeating', 'attributes.BusinessAcceptsCreditCards',
       'attributes.RestaurantsReservations',
       'attributes.RestaurantsGoodForGroups', 'attributes.Caters',
       'attributes.BikeParking', 'attributes.RestaurantsDelivery',
       'attributes.GoodForKids', 'is_open', 'attributes.HasTV',
       'attributes.RestaurantsTakeOut', 'attributes.RestaurantsPriceRange2_1',
       'attributes.RestaurantsPriceRange2_2',
       'attributes.RestaurantsPriceRange2_3',
       'attributes.RestaurantsPriceRange2_4',
       'attributes.RestaurantsPriceRange2_missing', 'attributes.WiFi_free',
       'attributes.WiFi_missing', 'attributes.WiFi_no', 'attributes.WiFi_paid',
       'attributes.NoiseLevel_average', 'attributes.NoiseLevel_loud',
       'attributes.NoiseLevel_missing', 'attributes.NoiseLevel_quiet',
       'attributes.NoiseLevel_very_loud', 'state_AB', 'state_AZ', 'state_FL',
       'state_IL', 'state_NC', 'state_NV', 'state_NY', 'state_OH', 'state_ON',
       's

In [17]:
reviews

0        Rush Inn: So, we stopped here on our way to th...
1        GreenMix: This is our go-to healthy spot! The ...
2        BarBurrito - Gerrard: Food court meal at Gerra...
3        SalvaMex: Located on Rainbow/Charleston, this ...
4        Hop Hing: No frills Chinese takeout joint whic...
                               ...                        
23139    Mexquite Mexican Eatery: awesome food, great p...
23140    Sweet Tomatoes: What a great experience!!  I l...
23141    Brewery Bar & Grill: I love hanging out at thi...
23142    Augie's Sports Grill: Here is the scenario... ...
23143    La Cabane \xc3\xa0 Sucre Constantin: The good ...
Name: merged_review, Length: 23144, dtype: object

In [18]:
from collections import defaultdict

def time_to_float(time_str):
    hours, minutes = map(int, time_str.split(':'))
    return hours + minutes / 60.0

# Function to convert JSON-like column to 0/1 representation and add existence column
def convert_json_column(df, column):
    print(column + ':')
    count_missing = defaultdict(int)
    json_keys = set()
    existence_col = []

    # Extract unique keys from JSON-like strings
    for item in df[column].dropna():
        # print(item)
        json_obj = item
        json_keys.update(json_obj.keys())

    print(json_keys)
    # Special
    if 'divey' in json_keys:
        json_keys.remove('divey')
    # Create a new DataFrame for the JSON column
    json_df = pd.DataFrame(0, index=df.index, columns=[f'{column}_{key}' for key in json_keys])

    # Populate the new DataFrame
    for idx, item in df[column].items():
        if pd.isna(item):
            existence_col.append(0)
            json_obj = dict()
        else:
            json_obj = item
            existence_col.append(1)
        if column == 'hours':
            for key in json_keys:
                if key not in json_obj:
                    count_missing[key] += 1
                    json_df.at[idx, f'{column}_{key}_0'] = 0
                    json_df.at[idx, f'{column}_{key}_1'] = 0
                else:
                    start_time, end_time = json_obj[key].split('-')
                    start_time_float = time_to_float(start_time)
                    end_time_float = time_to_float(end_time)
                    if end_time_float < start_time_float:
                        end_time_float += 24  # Adjust for times past midnight
                    json_df.at[idx, f'{column}_{key}_0'] = start_time_float
                    json_df.at[idx, f'{column}_{key}_1'] = end_time_float
        else:
            for key in json_keys:
                if key not in json_obj:
                    count_missing[key] += 1
                    json_df.at[idx, f'{column}_{key}'] = 0.5
                else:
                    json_df.at[idx, f'{column}_{key}'] = float(json_obj[key])

    # Add the existence column
    json_df[f'{column}_exists'] = existence_col
    print(count_missing)

    return json_df

# Create a list to hold all the new DataFrames
new_dataframes = []

# Convert each JSON column and store the resulting DataFrame
for col in used_json_columns:
    new_df = convert_json_column(data_encoded, col)
    new_dataframes.append(new_df)

# Merge all new DataFrames back into the original DataFrame
json_features = pd.concat(new_dataframes, axis=1)
json_features.head()

attributes.Ambience:
{'upscale', 'trendy', 'classy', 'touristy', 'romantic', 'hipster', 'casual', 'intimate', 'divey'}
defaultdict(<class 'int'>, {'upscale': 3399, 'trendy': 3324, 'classy': 3324, 'touristy': 3324, 'romantic': 3324, 'hipster': 3488, 'casual': 3324, 'intimate': 3324})
attributes.BusinessParking:
{'validated', 'lot', 'garage', 'valet', 'street'}
defaultdict(<class 'int'>, {'validated': 3086, 'lot': 3086, 'garage': 3085, 'valet': 3086, 'street': 3086})
attributes.GoodForMeal:
{'lunch', 'breakfast', 'dinner', 'dessert', 'brunch', 'latenight'}
defaultdict(<class 'int'>, {'lunch': 9447, 'breakfast': 9447, 'dinner': 9447, 'dessert': 9447, 'brunch': 9447, 'latenight': 9447})
hours:
{'Thursday', 'Sunday', 'Tuesday', 'Friday', 'Saturday', 'Wednesday', 'Monday'}
defaultdict(<class 'int'>, {'Sunday': 6940, 'Thursday': 4325, 'Tuesday': 4820, 'Friday': 4289, 'Saturday': 4624, 'Wednesday': 4438, 'Monday': 5988})


Unnamed: 0,attributes.Ambience_upscale,attributes.Ambience_trendy,attributes.Ambience_classy,attributes.Ambience_touristy,attributes.Ambience_romantic,attributes.Ambience_hipster,attributes.Ambience_casual,attributes.Ambience_intimate,attributes.Ambience_exists,attributes.BusinessParking_validated,...,hours_Tuesday_1,hours_Friday_0,hours_Friday_1,hours_Saturday_0,hours_Saturday_1,hours_Wednesday_0,hours_Wednesday_1,hours_Monday_0,hours_Monday_1,hours_exists
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,...,26.0,11.5,26.0,11.5,26.0,15.0,26.0,16.0,26.0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,...,21.0,11.0,21.0,11.0,20.5,11.0,21.0,11.0,21.0,1
2,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0,0.5,...,21.0,11.0,21.0,11.0,21.0,11.0,21.0,11.0,21.0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [19]:
json_features.columns

Index(['attributes.Ambience_upscale', 'attributes.Ambience_trendy',
       'attributes.Ambience_classy', 'attributes.Ambience_touristy',
       'attributes.Ambience_romantic', 'attributes.Ambience_hipster',
       'attributes.Ambience_casual', 'attributes.Ambience_intimate',
       'attributes.Ambience_exists', 'attributes.BusinessParking_validated',
       'attributes.BusinessParking_lot', 'attributes.BusinessParking_garage',
       'attributes.BusinessParking_valet', 'attributes.BusinessParking_street',
       'attributes.BusinessParking_exists', 'attributes.GoodForMeal_lunch',
       'attributes.GoodForMeal_breakfast', 'attributes.GoodForMeal_dinner',
       'attributes.GoodForMeal_dessert', 'attributes.GoodForMeal_brunch',
       'attributes.GoodForMeal_latenight', 'attributes.GoodForMeal_exists',
       'hours_Thursday', 'hours_Sunday', 'hours_Tuesday', 'hours_Friday',
       'hours_Saturday', 'hours_Wednesday', 'hours_Monday', 'hours_Thursday_0',
       'hours_Thursday_1', 'hours

In [20]:
final_additional_features = pd.concat([
    additional_features,
    json_features
], axis=1)
print(f"Total number of NaN values: {final_additional_features.isna().sum().sum()}")

# Create DataFrame for training and validation sets
train_data = data_encoded.loc[train_idx]
val_data = data_encoded.loc[val_idx]
additional_features_train = final_additional_features.loc[train_idx]
additional_features_val = final_additional_features.loc[val_idx]

final_additional_features.head()

Total number of NaN values: 0


Unnamed: 0,attributes.OutdoorSeating,attributes.BusinessAcceptsCreditCards,attributes.RestaurantsReservations,attributes.RestaurantsGoodForGroups,attributes.Caters,attributes.BikeParking,attributes.RestaurantsDelivery,attributes.GoodForKids,is_open,attributes.HasTV,...,hours_Tuesday_1,hours_Friday_0,hours_Friday_1,hours_Saturday_0,hours_Saturday_1,hours_Wednesday_0,hours_Wednesday_1,hours_Monday_0,hours_Monday_1,hours_exists
0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,...,26.0,11.5,26.0,11.5,26.0,15.0,26.0,16.0,26.0,1
1,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,...,21.0,11.0,21.0,11.0,20.5,11.0,21.0,11.0,21.0,1
2,0.5,0.5,0.0,0.5,0.5,0.5,0.5,0.5,1.0,0.5,...,21.0,11.0,21.0,11.0,21.0,11.0,21.0,11.0,21.0,1
3,0.0,1.0,0.0,1.0,1.0,0.5,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,1.0,0.0,0.0,0.5,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


## Model

In [28]:
# !pip install -Uqq ipdb
# import ipdb, pdb
# %pdb on

Automatic pdb calling has been turned ON


In [None]:
!cp -r ./bert-finetuned/checkpoint-1030/ "/content/drive/MyDrive/Colab Notebooks/DSC 253 - Adv Data-Driven Text Mining/Challenge/bert-finetuned/"

In [None]:
# Free up all CUDA memory
import torch
import gc
# del model
try:
    del trainer
except:
    pass
torch.cuda.empty_cache()
gc.collect()
!nvidia-smi

Mon May 20 08:42:08 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA L4                      Off | 00000000:00:03.0 Off |                    0 |
| N/A   72C    P0              37W /  72W |   2539MiB / 23034MiB |     56%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

### Version 1

In [None]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback, BertModel
from sklearn.metrics import f1_score
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

class BERTWithAdditionalFeatures(nn.Module):
    def __init__(self, num_additional_features, num_labels):
        super(BERTWithAdditionalFeatures, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.additional_features_layer = nn.Sequential(
            nn.Linear(num_additional_features, 128),
            nn.ReLU(),
            # nn.Linear(128, 128),
            # nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.3)
        )
        self.final_layer = nn.Sequential(
            nn.Linear(self.bert.config.hidden_size + 64, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, num_labels),
            nn.Softmax(dim=1)
        )
        self.loss_fct = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, additional_features, labels):
        # print("input_ids", input_ids.shape, input_ids.dtype)
        # print("attention_mask", attention_mask.shape, attention_mask.dtype)
        # print("additional_features", additional_features.shape, additional_features.dtype)
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        bert_output = bert_outputs.last_hidden_state[:, 0, :]  # Take the [CLS] token
        # pooled_output = bert_outputs[1]  # Pooler output
        # print("pooled_output", pooled_output.shape, pooled_output.dtype)
        # print("bert_output", bert_output.shape, bert_output.dtype)
        additional_features_output = self.additional_features_layer(additional_features)
        # print("additional_features_output", additional_features_output.shape, additional_features_output.dtype)
        combined_output = torch.cat((bert_output, additional_features_output), dim=1)
        logits = self.final_layer(combined_output)
        # print("logits", logits.shape, logits.dtype)
        # print("labels", labels.shape, labels.dtype)
        loss = self.loss_fct(logits, labels)
        return loss, logits

# Define a custom dataset
class ReviewDataset(Dataset):
    def __init__(self, reviews, labels, additional_features, tokenizer, max_len):
        self.reviews = reviews
        self.labels = labels
        self.additional_features = additional_features.values
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, item):
        review = str(self.reviews[item])
        label = self.labels[item]
        additional_features = self.additional_features[item]

        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'review_text': review,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long),
            'additional_features': torch.tensor(additional_features, dtype=torch.float)
        }

# Create dataset instances
num_labels = len(labels.unique())-1
num_addition_columns = len(final_additional_features.columns)
# Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_dataset = ReviewDataset(
    reviews=train_data['merged_review'].tolist(),
    labels=train_data['label_encoded'].tolist(),
    additional_features=additional_features_train,
    tokenizer=tokenizer,
    max_len=128
)

val_dataset = ReviewDataset(
    reviews=val_data['merged_review'].tolist(),
    labels=val_data['label_encoded'].tolist(),
    additional_features=additional_features_val,
    tokenizer=tokenizer,
    max_len=128
)


# Define training arguments
training_args = TrainingArguments(
    output_dir='./bert-finetuned',
    num_train_epochs=20,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    logging_dir='./logs',
    logging_steps=100,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    dataloader_num_workers=16,
    weight_decay=0.05,  # Add weight decay for regularization
)

# Define compute_metrics function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average='micro')
    return {
        'f1 micro': f1,
    }

# Initialize Trainer
trainer = Trainer(
    model=BERTWithAdditionalFeatures(num_additional_features=num_addition_columns, num_labels=num_labels),
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# Train the model if we are not using a saved model
continue_epoch = 0
saved_model_path = os.path.join(data_path, 'bert-finetuned', f'checkpoint-epoch-{continue_epoch}')
use_saved_model = os.path.exists(saved_model_path)
if continue_epoch < training_args.num_train_epochs:
    if use_saved_model:
        print(f"Continuing training from epoch {continue_epoch}")
        trainer.train(resume_from_checkpoint=saved_model_path)
    else:
        print("Training a new model")
        # Train the model with exception handling
        trainer.train()

    if False:
        checkpoint_dir = os.path.join(data_path, 'bert-finetuned', f'checkpoint-epoch-{int(trainer.state.epoch)}')
        trainer.save_model(checkpoint_dir)
        tokenizer.save_pretrained(checkpoint_dir)
        print(f"Saved model checkpoint to {checkpoint_dir}")
else:
    if use_saved_model:
        print("Loaded saved model from", saved_model_path)
        trainer.load(resume_from_checkpoint=saved_model_path)
    else:
        raise FileNotFoundError(f"Modele not found at {saved_model_path}")

# Evaluate the model on the validation set
eval_results = trainer.evaluate(eval_dataset=val_dataset)

# Print the F1 score
print(f"Validation F1 Score: {eval_results['eval_f1 micro']:.4f}")


Training a new model


  self.pid = os.fork()


Epoch,Training Loss,Validation Loss,F1 micro
1,No log,1.7543,0.740205
2,1.932400,1.726681,0.743248
3,1.734700,1.713898,0.750475
4,1.709400,1.711406,0.750856
5,1.696600,1.718499,0.743248
6,1.696600,1.701486,0.763028
7,1.692300,1.708133,0.75504
8,1.684100,1.706736,0.75466
9,1.672300,1.704142,0.757322


  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()


Validation F1 Score: 0.7630


### Version 2

In [None]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback, BertModel
from sklearn.metrics import f1_score
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

class BERTWithAdditionalFeatures(nn.Module):
    def __init__(self, num_additional_features, num_labels):
        super(BERTWithAdditionalFeatures, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.additional_features_layer = nn.Sequential(
            nn.Linear(num_additional_features, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            # nn.Linear(128, 64),
            # nn.ReLU(),
            nn.Dropout(0.3)
        )
        self.final_layer = nn.Sequential(
            nn.Linear(self.bert.config.hidden_size + 128, 256),
            nn.ReLU(),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, num_labels),
            nn.Softmax(dim=1)
        )
        self.loss_fct = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, additional_features, labels):
        # print("input_ids", input_ids.shape, input_ids.dtype)
        # print("attention_mask", attention_mask.shape, attention_mask.dtype)
        # print("additional_features", additional_features.shape, additional_features.dtype)
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        bert_output = bert_outputs.last_hidden_state[:, 0, :]  # Take the [CLS] token
        # pooled_output = bert_outputs[1]  # Pooler output
        # print("pooled_output", pooled_output.shape, pooled_output.dtype)
        # print("bert_output", bert_output.shape, bert_output.dtype)
        additional_features_output = self.additional_features_layer(additional_features)
        # print("additional_features_output", additional_features_output.shape, additional_features_output.dtype)
        combined_output = torch.cat((bert_output, additional_features_output), dim=1)
        logits = self.final_layer(combined_output)
        # print("logits", logits.shape, logits.dtype)
        # print("labels", labels.shape, labels.dtype)
        loss = self.loss_fct(logits, labels)
        return loss, logits

# Define a custom dataset
class ReviewDataset(Dataset):
    def __init__(self, reviews, labels, additional_features, tokenizer, max_len):
        self.reviews = reviews
        self.labels = labels
        self.additional_features = additional_features.values
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, item):
        review = str(self.reviews[item])
        label = self.labels[item]
        additional_features = self.additional_features[item]

        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'review_text': review,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long),
            'additional_features': torch.tensor(additional_features, dtype=torch.float)
        }

# Create dataset instances
num_labels = len(labels.unique())-1
num_addition_columns = len(final_additional_features.columns)
# Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_dataset = ReviewDataset(
    reviews=train_data['merged_review'].tolist(),
    labels=train_data['label_encoded'].tolist(),
    additional_features=additional_features_train,
    tokenizer=tokenizer,
    max_len=128
)

val_dataset = ReviewDataset(
    reviews=val_data['merged_review'].tolist(),
    labels=val_data['label_encoded'].tolist(),
    additional_features=additional_features_val,
    tokenizer=tokenizer,
    max_len=128
)


# Define training arguments
training_args = TrainingArguments(
    output_dir='./bert-finetuned',
    num_train_epochs=20,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    logging_dir='./logs',
    logging_steps=100,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    dataloader_num_workers=16,
    weight_decay=0.05,  # Add weight decay for regularization
)

# Define compute_metrics function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average='micro')
    return {
        'f1 micro': f1,
    }

# Initialize Trainer
trainer = Trainer(
    model=BERTWithAdditionalFeatures(num_additional_features=num_addition_columns, num_labels=num_labels),
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# Train the model if we are not using a saved model
continue_epoch = 0
saved_model_path = os.path.join(data_path, 'bert-finetuned', f'checkpoint-epoch-{continue_epoch}')
use_saved_model = os.path.exists(saved_model_path)
if continue_epoch < training_args.num_train_epochs:
    if use_saved_model:
        print(f"Continuing training from epoch {continue_epoch}")
        trainer.train(resume_from_checkpoint=saved_model_path)
    else:
        print("Training a new model")
        # Train the model with exception handling
        trainer.train()

    if False:
        checkpoint_dir = os.path.join(data_path, 'bert-finetuned', f'checkpoint-epoch-{int(trainer.state.epoch)}')
        trainer.save_model(checkpoint_dir)
        tokenizer.save_pretrained(checkpoint_dir)
        print(f"Saved model checkpoint to {checkpoint_dir}")
else:
    if use_saved_model:
        print("Loaded saved model from", saved_model_path)
        trainer.load(resume_from_checkpoint=saved_model_path)
    else:
        raise FileNotFoundError(f"Modele not found at {saved_model_path}")

# Evaluate the model on the validation set
eval_results = trainer.evaluate(eval_dataset=val_dataset)

# Print the F1 score
print(f"Validation F1 Score: {eval_results['eval_f1 micro']:.4f}")


Training a new model


  self.pid = os.fork()


Epoch,Training Loss,Validation Loss,F1 micro
1,No log,1.857843,0.675922
2,2.073000,1.760361,0.739064
3,1.782400,1.715658,0.753138
4,1.715800,1.714512,0.750095
5,1.704300,1.708216,0.753138
6,1.704300,1.707057,0.75504
7,1.694900,1.704361,0.756561
8,1.692300,1.70604,0.75542
9,1.687300,1.705354,0.756181
10,1.681600,1.71035,0.750475


  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()


Validation F1 Score: 0.7566


### XGBoost

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score

# Convert the datasets into DMatrix, the internal data structure that XGBoost uses
train_dmatrix = xgb.DMatrix(data=additional_features_train, label=train_data['label_encoded'].tolist())
val_dmatrix = xgb.DMatrix(data=additional_features_val, label=val_data['label_encoded'].tolist())

# Define the parameters for the XGBoost model
params = {
    'objective': 'multi:softmax',  # For multi-class classification
    'num_class': num_labels,  # Number of classes
    'eval_metric': 'mlogloss',  # Default evaluation metric
    'eta': 0.1,  # Learning rate
    'max_depth': 6  # Depth of the trees
}

# Define a custom evaluation metric function
def f1_micro_eval(preds, dtrain):
    labels = dtrain.get_label()
    preds = preds.argmax(-1)
    # print(preds)
    f1_micro = f1_score(labels, preds, average='micro')
    return 'f1_micro', f1_micro

# Train the model with the custom evaluation metric
num_rounds = 1000  # Number of boosting rounds
watchlist = [(train_dmatrix, 'train'), (val_dmatrix, 'eval')]
bst = xgb.train(params, train_dmatrix, num_rounds, watchlist, early_stopping_rounds=10, feval=f1_micro_eval, maximize=True)

# Predict the labels of the validation set
y_pred = bst.predict(val_dmatrix).astype(int)

# Calculate evaluation metrics
accuracy = accuracy_score(val_data['label_encoded'].tolist(), y_pred)
f1 = f1_score(val_data['label_encoded'].tolist(), y_pred, average='micro')

print(f"Validation Accuracy: {accuracy:.4f}")
print(f"Validation F1 Score: {f1:.4f}")


[0]	train-mlogloss:2.19266	train-f1_micro:0.47646	eval-mlogloss:2.21455	eval-f1_micro:0.40396
[1]	train-mlogloss:2.10368	train-f1_micro:0.49853	eval-mlogloss:2.14396	eval-f1_micro:0.40510
[2]	train-mlogloss:2.02927	train-f1_micro:0.50832	eval-mlogloss:2.08493	eval-f1_micro:0.41765
[3]	train-mlogloss:1.96298	train-f1_micro:0.51859	eval-mlogloss:2.03397	eval-f1_micro:0.42031
[4]	train-mlogloss:1.90576	train-f1_micro:0.52715	eval-mlogloss:1.98967	eval-f1_micro:0.42488
[5]	train-mlogloss:1.85544	train-f1_micro:0.53400	eval-mlogloss:1.95199	eval-f1_micro:0.42944




[6]	train-mlogloss:1.80870	train-f1_micro:0.54037	eval-mlogloss:1.91721	eval-f1_micro:0.43096
[7]	train-mlogloss:1.76767	train-f1_micro:0.54218	eval-mlogloss:1.88701	eval-f1_micro:0.43134
[8]	train-mlogloss:1.72792	train-f1_micro:0.55055	eval-mlogloss:1.85782	eval-f1_micro:0.43362
[9]	train-mlogloss:1.69198	train-f1_micro:0.55473	eval-mlogloss:1.83223	eval-f1_micro:0.43857
[10]	train-mlogloss:1.65881	train-f1_micro:0.56110	eval-mlogloss:1.80863	eval-f1_micro:0.44199
[11]	train-mlogloss:1.62772	train-f1_micro:0.56167	eval-mlogloss:1.78616	eval-f1_micro:0.44389
[12]	train-mlogloss:1.59766	train-f1_micro:0.56814	eval-mlogloss:1.76598	eval-f1_micro:0.44618
[13]	train-mlogloss:1.57025	train-f1_micro:0.57223	eval-mlogloss:1.74737	eval-f1_micro:0.44922
[14]	train-mlogloss:1.54474	train-f1_micro:0.57708	eval-mlogloss:1.73030	eval-f1_micro:0.44922
[15]	train-mlogloss:1.52023	train-f1_micro:0.57974	eval-mlogloss:1.71493	eval-f1_micro:0.44656
[16]	train-mlogloss:1.49677	train-f1_micro:0.58526	eva

### TF-IDF

In [None]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize
import string

ps = PorterStemmer()
def preprocess_df(df, stemming=False):
    # get English stopwords
    stop_words = set(stopwords.words('english'))
    stop_words.add('would')
    stop_words.add('The')
    # prepare translation table to translate punctuation to space
    translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    preprocessed_sentences = []
    for i, row in df.iterrows():
        sent = row["merged_review"]
        sent_nopuncts = sent.translate(translator)
        words_list = sent_nopuncts.strip().split()
        if stemming == True:
            words_list = [ps.stem(word) for word in words_list]
        filtered_words = [word for word in words_list if word not in stop_words and len(word) != 1] # also skip space from above translation
        preprocessed_sentences.append(" ".join(filtered_words))
    df["text"] = preprocessed_sentences
    return df

preprocess_df(data_encoded)

tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=True,
                        preprocessor=None,  # applied preprocessor in Data Cleaning
                        tokenizer=word_tokenize,
                        use_idf=True,
                        norm='l2',
                        smooth_idf=True,
                        stop_words= 'english',
                        max_df=0.4,
                        sublinear_tf=True)

X =  tfidf.fit_transform(data_encoded["text"])
train_text = X[train_idx]
val_text = X[val_idx]



In [None]:
train_text.shape, val_text.shape, X.shape

((10515, 84939), (2629, 84939), (23144, 84939))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

clf = LogisticRegression(max_iter=100000000, C=5, class_weight='balanced').fit(train_text, train_data['label_encoded'].tolist())
f1_score(val_data['label_encoded'].tolist(), clf.predict(val_text), average='micro')

0.8113351084062381

In [None]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback, BertModel
from sklearn.metrics import f1_score
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# pretrained_name = 'karimbkh/BERT_fineTuned_Sentiment_Classification_Yelp'
pretrained_name = 'bert-base-uncased'

class BERTWithAdditionalFeatures(nn.Module):
    def __init__(self, num_additional_features, num_labels):
        super(BERTWithAdditionalFeatures, self).__init__()
        self.bert = BertModel.from_pretrained(pretrained_name)
        self.additional_features_layer = nn.Sequential(
            nn.Linear(num_additional_features, 128),
            nn.ReLU(),
            # nn.Linear(128, 128),
            # nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.2)
        )
        self.tfidf_layer = nn.Sequential(
            nn.Linear(X.shape[1], 128),
            nn.ReLU(),
            # nn.Linear(128, 128),
            # nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            # nn.Dropout(0.1)
        )
        self.final_layer = nn.Sequential(
            nn.Linear(self.bert.config.hidden_size+ 64 + 64, 64),
            nn.ReLU(),
            # nn.Dropout(0.3),
            nn.Linear(64, num_labels),
            nn.Softmax(dim=1)
        )
        self.loss_fct = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, additional_features, text_input, labels):
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        bert_output = bert_outputs.last_hidden_state[:, 0, :]  # Take the [CLS] token

        additional_features_output = self.additional_features_layer(additional_features)
        text_output = self.tfidf_layer(text_input)

        # print(self.tfidf_layer)
        # print("text_input", text_input.shape, text_input.dtype)
        # print("bert_output", bert_output.shape, bert_output.dtype)
        # print("additional_features_output", additional_features_output.shape, additional_features_output.dtype)
        # print("text_output", text_output.shape, text_output.dtype)
        combined_output = torch.cat((bert_output, additional_features_output, text_output), dim=1)
        logits = self.final_layer(combined_output)

        loss = self.loss_fct(logits, labels)
        return loss, logits

# Define a custom dataset
class ReviewDataset(Dataset):
    def __init__(self, reviews, labels, additional_features, text_input, tokenizer, max_len):
        self.reviews = reviews
        self.labels = labels
        self.additional_features = additional_features.values
        self.text_input = text_input
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, item):
        review = str(self.reviews[item])
        label = self.labels[item]
        additional_features = self.additional_features[item]
        text_input = self.text_input[item]

        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        # print("additional_features",torch.tensor(additional_features, dtype=torch.float).shape)
        # print("text_input",torch.tensor(text_input.toarray(), dtype=torch.float).shape)
        return {
            'review_text': review,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long),
            'additional_features': torch.tensor(additional_features, dtype=torch.float),
            'text_input': torch.tensor(text_input.toarray().squeeze(), dtype=torch.float)
        }

# Create dataset instances
num_labels = len(labels.unique())-1
num_addition_columns = len(final_additional_features.columns)
# Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained(pretrained_name)

train_dataset = ReviewDataset(
    reviews=train_data['merged_review'].tolist(),
    labels=train_data['label_encoded'].tolist(),
    additional_features=additional_features_train,
    text_input=train_text,
    tokenizer=tokenizer,
    max_len=128
)

val_dataset = ReviewDataset(
    reviews=val_data['merged_review'].tolist(),
    labels=val_data['label_encoded'].tolist(),
    additional_features=additional_features_val,
    text_input=val_text,
    tokenizer=tokenizer,
    max_len=128
)


# Define training arguments
training_args = TrainingArguments(
    output_dir='./bert-finetuned',
    num_train_epochs=10,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    logging_dir='./logs',
    logging_steps=100,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    dataloader_num_workers=16,
    weight_decay=0.05,  # Add weight decay for regularization
)

# Define compute_metrics function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average='micro')
    return {
        'f1 micro': f1,
    }

# Initialize Trainer
trainer = Trainer(
    model=BERTWithAdditionalFeatures(num_additional_features=num_addition_columns, num_labels=num_labels),
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# Train the model if we are not using a saved model
continue_epoch = 0
saved_model_path = os.path.join(data_path, 'bert-finetuned', f'checkpoint-epoch-{continue_epoch}')
use_saved_model = os.path.exists(saved_model_path)
if continue_epoch < training_args.num_train_epochs:
    if use_saved_model:
        print(f"Continuing training from epoch {continue_epoch}")
        trainer.train(resume_from_checkpoint=saved_model_path)
    else:
        print("Training a new model")
        # Train the model with exception handling
        trainer.train()

    if False:
        checkpoint_dir = os.path.join(data_path, 'bert-finetuned', f'checkpoint-epoch-{int(trainer.state.epoch)}')
        trainer.save_model(checkpoint_dir)
        tokenizer.save_pretrained(checkpoint_dir)
        print(f"Saved model checkpoint to {checkpoint_dir}")
else:
    if use_saved_model:
        print("Loaded saved model from", saved_model_path)
        trainer.load(resume_from_checkpoint=saved_model_path)
    else:
        raise FileNotFoundError(f"Modele not found at {saved_model_path}")

# Evaluate the model on the validation set
eval_results = trainer.evaluate(eval_dataset=val_dataset)

# Print the F1 score
print(f"Validation F1 Score: {eval_results['eval_f1 micro']:.4f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/715 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Training a new model


  self.pid = os.fork()


Epoch,Training Loss,Validation Loss,F1 micro
1,No log,1.777127,0.746672
2,1.940800,1.724566,0.75542
3,1.733700,1.712593,0.756561
4,1.695700,1.70822,0.757703
5,1.683100,1.699838,0.764549
6,1.683100,1.698514,0.764549
7,1.672700,1.697787,0.764549
8,1.664300,1.694522,0.767973
9,1.653800,1.693531,0.769494
10,1.653200,1.692473,0.769874


  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()


Validation F1 Score: 0.7699


In [None]:
import torch
# torch.tensor(train_text[1:4].toarray())
torch.tensor(train_text[1].toarray().squeeze(), dtype=torch.float).shape

torch.Size([83570])

### Joint

In [None]:
fin_train_data = data_encoded.loc[train_index]
fin_test_data = data_encoded.loc[test_index]
additional_features_fin_train = final_additional_features.loc[train_index]
additional_features_fin_test = final_additional_features.loc[test_index]

#### ML

In [None]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback, BertModel
from sklearn.metrics import f1_score
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

class BERTWithAdditionalFeatures(nn.Module):
    def __init__(self, num_additional_features, num_labels):
        super(BERTWithAdditionalFeatures, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.additional_features_layer = nn.Sequential(
            nn.Linear(num_additional_features, 128),
            nn.ReLU(),
            # nn.Linear(128, 128),
            # nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.3)
        )
        self.final_layer = nn.Sequential(
            nn.Linear(self.bert.config.hidden_size + 64, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, num_labels),
            nn.Softmax(dim=1)
        )
        self.loss_fct = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, additional_features, labels):
        # print("input_ids", input_ids.shape, input_ids.dtype)
        # print("attention_mask", attention_mask.shape, attention_mask.dtype)
        # print("additional_features", additional_features.shape, additional_features.dtype)
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        bert_output = bert_outputs.last_hidden_state[:, 0, :]  # Take the [CLS] token
        # pooled_output = bert_outputs[1]  # Pooler output
        # print("pooled_output", pooled_output.shape, pooled_output.dtype)
        # print("bert_output", bert_output.shape, bert_output.dtype)
        additional_features_output = self.additional_features_layer(additional_features)
        # print("additional_features_output", additional_features_output.shape, additional_features_output.dtype)
        combined_output = torch.cat((bert_output, additional_features_output), dim=1)
        logits = self.final_layer(combined_output)
        # print("logits", logits.shape, logits.dtype)
        # print("labels", labels.shape, labels.dtype)
        loss = self.loss_fct(logits, labels)
        return loss, logits

# Define a custom dataset
class ReviewDataset(Dataset):
    def __init__(self, reviews, labels, additional_features, tokenizer, max_len):
        self.reviews = reviews
        self.labels = labels
        self.additional_features = additional_features.values
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, item):
        review = str(self.reviews[item])
        label = self.labels[item]
        additional_features = self.additional_features[item]

        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'review_text': review,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long),
            'additional_features': torch.tensor(additional_features, dtype=torch.float)
        }

# Create dataset instances
num_labels = len(labels.unique())-1
num_addition_columns = len(final_additional_features.columns)
# Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_dataset = ReviewDataset(
    reviews=fin_train_data['merged_review'].tolist(),
    labels=fin_train_data['label_encoded'].tolist(),
    additional_features=additional_features_fin_train,
    tokenizer=tokenizer,
    max_len=128
)

val_dataset = ReviewDataset(
    reviews=val_data['merged_review'].tolist(),
    labels=val_data['label_encoded'].tolist(),
    additional_features=additional_features_val,
    tokenizer=tokenizer,
    max_len=128
)


# Define training arguments
training_args = TrainingArguments(
    output_dir='./bert-finetuned',
    num_train_epochs=10,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    logging_dir='./logs',
    logging_steps=100,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    dataloader_num_workers=16,
    weight_decay=0.05,  # Add weight decay for regularization
)

# Define compute_metrics function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average='micro')
    return {
        'f1 micro': f1,
    }

# Initialize Trainer
trainer = Trainer(
    model=BERTWithAdditionalFeatures(num_additional_features=num_addition_columns, num_labels=num_labels),
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# Train the model if we are not using a saved model
continue_epoch = 10
saved_model_path = os.path.join(data_path, 'bert-finetuned', f'checkpoint-epoch-{continue_epoch}')
use_saved_model = os.path.exists(saved_model_path)
if continue_epoch < training_args.num_train_epochs:
    if use_saved_model:
        print(f"Continuing training from epoch {continue_epoch}")
        trainer.train(resume_from_checkpoint=saved_model_path)
    else:
        print("Training a new model")
        # Train the model with exception handling
        trainer.train()

    if False:
        checkpoint_dir = os.path.join(data_path, 'bert-finetuned', f'checkpoint-epoch-{int(trainer.state.epoch)}')
        trainer.save_model(checkpoint_dir)
        tokenizer.save_pretrained(checkpoint_dir)
        print(f"Saved model checkpoint to {checkpoint_dir}")
else:
    if use_saved_model:
        print("Loaded saved model from", saved_model_path)
        trainer.train(resume_from_checkpoint=saved_model_path)
    else:
        raise FileNotFoundError(f"Modele not found at {saved_model_path}")

# Evaluate the model on the validation set
eval_results = trainer.evaluate(eval_dataset=val_dataset)

# Print the F1 score
print(f"Validation F1 Score: {eval_results['eval_f1 micro']:.4f}")


Loaded saved model from /content/drive/MyDrive/Colab Notebooks/DSC 253 - Adv Data-Driven Text Mining/Challenge/bert-finetuned/checkpoint-epoch-10


Epoch,Training Loss,Validation Loss


  self.pid = os.fork()


Validation F1 Score: 0.8235


In [None]:
test_dataset = ReviewDataset(
    reviews=fin_test_data['merged_review'].tolist(),
    labels=np.zeros(fin_test_data['label_encoded'].shape),
    additional_features=additional_features_fin_test,
    tokenizer=tokenizer,
    max_len=128
)

output_ml = trainer.predict(test_dataset)
# pred_ml = output_ml.predictions.argmax(-1).astype(int)
pred_ml = output_ml.predictions

  self.pid = os.fork()


#### XG

In [None]:
# val_dmatrix = xgb.DMatrix(data=additional_features_val, label=val_data['label_encoded'].tolist())
# # pred_xg = bst.predict(val_dmatrix).astype(int)
# pred_xg = bst.predict(val_dmatrix, output_margin=False)

#### TFIDF

In [None]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize
import string

ps = PorterStemmer()
def preprocess_df(df, stemming=False):
    # get English stopwords
    stop_words = set(stopwords.words('english'))
    stop_words.add('would')
    stop_words.add('The')
    # prepare translation table to translate punctuation to space
    translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    preprocessed_sentences = []
    for i, row in df.iterrows():
        sent = row["merged_review"]
        sent_nopuncts = sent.translate(translator)
        words_list = sent_nopuncts.strip().split()
        if stemming == True:
            words_list = [ps.stem(word) for word in words_list]
        filtered_words = [word for word in words_list if word not in stop_words and len(word) != 1] # also skip space from above translation
        preprocessed_sentences.append(" ".join(filtered_words))
    df["text"] = preprocessed_sentences
    return df

preprocess_df(data_encoded)

tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=True,
                        preprocessor=None,  # applied preprocessor in Data Cleaning
                        tokenizer=word_tokenize,
                        use_idf=True,
                        norm='l2',
                        smooth_idf=True,
                        stop_words= 'english',
                        max_df=0.4,
                        sublinear_tf=True)

X =  tfidf.fit_transform(data_encoded["text"])
train_text = X[train_index]
test_text = X[test_index]



In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

clf = LogisticRegression(max_iter=100000000, C=5, class_weight='balanced').fit(train_text, fin_train_data['label_encoded'].tolist())
pred_lr = clf.predict_proba(test_text)

#### Join

In [None]:
import numpy as np

# Probabilities for sampling
probabilities = [0.4, 0.6]

pred_fin = pred_ml*probabilities[0] + pred_lr*probabilities[1]
final_predictions = pred_fin.argmax(-1).astype(int)

print("Final Predictions:", final_predictions)

Final Predictions: [0 9 9 ... 1 1 3]


In [None]:
label_final_predictions = label_encoder.inverse_transform(final_predictions)
label_final_predictions

array(['american (new)', 'thai', 'thai', ..., 'american (traditional)',
       'american (traditional)', 'canadian (new)'], dtype=object)

In [None]:
dic = {"Id": [], "Predicted": []}
for i, pred in enumerate(label_final_predictions):
    dic["Id"].append(i)
    dic["Predicted"].append(pred)

dic_df = pd.DataFrame.from_dict(dic)
dic_df.to_csv(data_path + "predicted.csv", index=False)

In [None]:
print(pred_ml.sum(1))
print(pred_lr.sum(1))
data_path + "predicted.csv"

[1.         1.0000001  0.99999994 ... 1.         1.0000001  1.        ]
[1. 1. 1. ... 1. 1. 1.]


'/content/drive/MyDrive/Colab Notebooks/DSC 253 - Adv Data-Driven Text Mining/Challenge/predicted.csv'

### Final

#### Prepare data

In [86]:
fin_train_data = data_encoded.loc[train_index]
fin_test_data = data_encoded.loc[test_index]
additional_features_fin_train = final_additional_features.loc[train_index]
additional_features_fin_test = final_additional_features.loc[test_index]
label_column = 'label_encoded'
text_column = 'merged_review'
name_column = 'name'

if not True:
    # validation
    ml_train = train_data
    ml_val = val_data
    ml_test = fin_test_data

    add_feat_train = additional_features_train
    add_feat_val = additional_features_val
    add_feat_test = additional_features_fin_test

    tfidf_train_index = train_idx
    tfidf_val_index = val_idx
    tfidf_test_index = test_index

    train_labels = data_encoded.loc[train_idx, label_column]
    val_labels = data_encoded.loc[val_idx, label_column]
    test_labels = data_encoded.loc[test_index, label_column]

else:
    # final
    ml_train = fin_train_data
    ml_val = val_data
    ml_test = fin_test_data

    add_feat_train = additional_features_fin_train
    add_feat_val = additional_features_val
    add_feat_test = additional_features_fin_test

    tfidf_train_index = train_index
    tfidf_val_index = val_idx
    tfidf_test_index = test_index

    train_labels = data_encoded.loc[train_index, label_column]
    val_labels = data_encoded.loc[val_idx, label_column]
    test_labels = data_encoded.loc[test_index, label_column]

#### TFIDF

In [87]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize
import string

ps = PorterStemmer()
def preprocess_df(df, stemming=False):
    # get English stopwords
    stop_words = set(stopwords.words('english'))
    stop_words.add('would')
    stop_words.add('The')
    # prepare translation table to translate punctuation to space
    translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    preprocessed_sentences = []
    for i, row in df.iterrows():
        sent = row[text_column]
        sent_nopuncts = sent.translate(translator)
        words_list = sent_nopuncts.strip().split()
        if stemming == True:
            words_list = [ps.stem(word) for word in words_list]
        filtered_words = [word for word in words_list if word not in stop_words and len(word) != 1] # also skip space from above translation
        preprocessed_sentences.append(" ".join(filtered_words))
    df["text"] = preprocessed_sentences
    return df

preprocess_df(data_encoded)

tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=True,
                        preprocessor=None,  # applied preprocessor in Data Cleaning
                        tokenizer=word_tokenize,
                        use_idf=True,
                        norm='l2',
                        smooth_idf=True,
                        stop_words= 'english',
                        max_df=0.4,
                        sublinear_tf=True)

X =  tfidf.fit_transform(data_encoded["text"])
train_text = X[tfidf_train_index]
val_text = X[tfidf_val_index]
test_text = X[tfidf_test_index]

In [88]:
train_text.shape, val_text.shape, test_text.shape

((13144, 84939), (2629, 84939), (10000, 84939))

In [89]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

clf = LogisticRegression(max_iter=100000000, C=5, class_weight='balanced').fit(train_text, train_labels.tolist())
print("F1 micro:", f1_score(val_labels.tolist(), clf.predict(val_text), average='micro'))
train_pred_lr = clf.predict_proba(train_text)
val_pred_lr = clf.predict_proba(val_text)
test_pred_lr = clf.predict_proba(test_text)

F1 micro: 0.9844047166222899


In [90]:
train_pred_lr.shape, val_pred_lr.shape, test_pred_lr.shape

((13144, 10), (2629, 10), (10000, 10))

#### Random Forest

In [91]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

# Initialize the Random Forest classifier
rf_model = RandomForestClassifier(n_estimators=300, random_state=42)

# Train the model
rf_model.fit(add_feat_train, train_labels.tolist())

# Validate the model
val_predictions = rf_model.predict(add_feat_val)

# Calculate F1 micro score
f1_micro = f1_score(val_labels.tolist(), val_predictions, average='micro')

print(f"Validation F1 Micro Score: {f1_micro:.4f}")

# Predict on the test set
train_pred_rf = rf_model.predict_proba(add_feat_train)
val_pred_rf = rf_model.predict_proba(add_feat_val)
test_pred_rf = rf_model.predict_proba(add_feat_test)

Validation F1 Micro Score: 1.0000


In [92]:
train_pred_rf.shape, val_pred_rf.shape, test_pred_rf.shape

((13144, 10), (2629, 10), (10000, 10))

#### XGBoost

In [93]:
from xgboost import XGBClassifier
from sklearn.metrics import f1_score

# Initialize the XGBoost classifier
xgb_model = XGBClassifier(n_estimators=400, random_state=42, use_label_encoder=False, eval_metric='mlogloss')

# Train the model
xgb_model.fit(add_feat_train, train_labels.tolist())

# Validate the model
val_predictions = xgb_model.predict(add_feat_val)

# Calculate F1 micro score
f1_micro = f1_score(val_labels.tolist(), val_predictions, average='micro')
print(f"Validation F1 Micro Score: {f1_micro:.4f}")

# Predict probabilities on the validation set
train_pred_xgb = xgb_model.predict_proba(add_feat_train)
val_pred_xgb = xgb_model.predict_proba(add_feat_val)
test_pred_xgb = xgb_model.predict_proba(add_feat_test)

Validation F1 Micro Score: 1.0000


In [94]:
train_pred_xgb.shape, val_pred_xgb.shape, test_pred_xgb.shape

((13144, 10), (2629, 10), (10000, 10))

#### BERTopic

In [95]:
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.dimensionality import BaseDimensionalityReduction
from sklearn.linear_model import LogisticRegression

# Skip over dimensionality reduction, replace cluster model with classifier,
# and reduce frequent words while we are at it.
empty_dimensionality_model = BaseDimensionalityReduction()
clf = LogisticRegression()
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# Create a fully supervised BERTopic instance
topic_model= BERTopic(
        umap_model=empty_dimensionality_model,
        hdbscan_model=clf,
        ctfidf_model=ctfidf_model,
        calculate_probabilities=True
)
topics, probs = topic_model.fit_transform(ml_train[text_column].tolist(), y=train_labels.tolist())

In [96]:
try_val , _ = topic_model.transform(ml_val[text_column].tolist())
# topics_test, probs_test = topic_model.transform(ml_val[text_column].tolist())

mappings = topic_model.topic_mapper_.get_mappings()
mappings = {value: key for key, value in mappings.items()}

print("F1 micro:", f1_score(val_labels.tolist(), list(map(lambda x: mappings[x], try_val)), average='micro'))

F1 micro: 0.7877519969570179


In [97]:
topic_model.save("./bertopic", serialization="pytorch", save_ctfidf=True)
new_topic_model = BERTopic.load("./bertopic")

In [98]:
topics_train, probs_train = new_topic_model.transform(ml_train[text_column].tolist())
topics_val, probs_val = new_topic_model.transform(ml_val[text_column].tolist())
topics_test, probs_test = new_topic_model.transform(ml_test[text_column].tolist())

print("F1 micro:", f1_score(val_labels.tolist(), list(map(lambda x: mappings[x], topics_val)), average='micro'))

F1 micro: 0.7151007987828071


In [99]:
if False:
    old_topics_val, _ = topic_model.transform(ml_val[text_column].tolist())
    print("F1 micro:", f1_score(val_labels.tolist(), list(map(lambda x: mappings[x], old_topics_val)), average='micro'))

    map_dict = dict()
    for i in range(len(old_topics_val)):
        if old_topics_val[i] not in map_dict:
            map_dict[old_topics_val[i]]=set()
            map_dict[old_topics_val[i]].add(topics_val[i])
        else:
            map_dict[old_topics_val[i]].add(topics_val[i])
    map_dict
# type(topics_val)

In [100]:
def permute(array, mapping):
    # Create a permutation array based on the mapping
    num_columns = array.shape[1]
    permutation = np.zeros(num_columns, dtype=int)

    for old_index, new_index in mapping.items():
        permutation[new_index] = old_index

    # Reorder the columns of the array based on the permutation
    permuted_array = array[:, permutation]
    return permuted_array

train_pred_bt = permute(probs_train, mappings)
val_pred_bt = permute(probs_val, mappings)
test_pred_bt = permute(probs_test, mappings)

tmp_predictions = val_pred_bt.argmax(-1).astype(int)
print("F1 micro:", f1_score(val_labels.tolist(), tmp_predictions, average='micro'))

F1 micro: 0.7151007987828071


In [101]:
train_pred_bt.shape, val_pred_bt.shape, test_pred_bt.shape

((13144, 10), (2629, 10), (10000, 10))

#### Joint to ML

In [102]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback, BertModel
from sklearn.metrics import f1_score
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

class BERTWithAdditionalFeatures(nn.Module):
    def __init__(self, num_additional_features, num_labels, num_models):
        super(BERTWithAdditionalFeatures, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.additional_features_layer = nn.Sequential(
            nn.Linear(num_additional_features, 128),
            nn.ReLU(),
            # nn.Linear(128, 128),
            # nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.3)
        )
        self.final_layer = nn.Sequential(
            nn.Linear(self.bert.config.hidden_size + 64, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, num_labels),
            nn.Softmax(dim=1)
        )
        # self.weights = nn.Parameter(torch.ones(num_models, 1) / num_models)
        self.weights = nn.Parameter(torch.tensor([0.55,0.05,0.05,0.05,0.3]).view(5, 1))
        self.loss_fct = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, additional_features, pred_prob, labels):
        # print("input_ids", input_ids.shape, input_ids.dtype)
        # print("attention_mask", attention_mask.shape, attention_mask.dtype)
        # print("additional_features", additional_features.shape, additional_features.dtype)
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        bert_output = bert_outputs.last_hidden_state[:, 0, :]  # Take the [CLS] token
        # pooled_output = bert_outputs[1]  # Pooler output
        # print("pooled_output", pooled_output.shape, pooled_output.dtype)
        # print("bert_output", bert_output.shape, bert_output.dtype)
        additional_features_output = self.additional_features_layer(additional_features)
        # print("additional_features_output", additional_features_output.shape, additional_features_output.dtype)
        combined_output = torch.cat((bert_output, additional_features_output), dim=1)
        logits = self.final_layer(combined_output)
        # print("logits", logits.shape, logits.dtype)
        # print("labels", labels.shape, labels.dtype)
        torch_pred_probs = torch.cat((pred_prob, logits.unsqueeze(2)), dim=2)
        # print("torch_pred_probs", torch_pred_probs.shape, torch_pred_probs.dtype)
        weighted_sum = torch.matmul(torch_pred_probs, self.weights/torch.sum(self.weights)).squeeze()
        # print("weighted_sum", weighted_sum.shape, weighted_sum.dtype)
        loss = self.loss_fct(weighted_sum, labels)
        return loss, weighted_sum

# Define a custom dataset
class ReviewDataset(Dataset):
    def __init__(self, reviews, labels, additional_features, pred_probs, tokenizer, max_len):
        self.reviews = reviews
        self.labels = labels
        self.additional_features = additional_features.values
        self.pred_probs = pred_probs
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, item):
        review = str(self.reviews[item])
        label = self.labels[item]
        additional_features = self.additional_features[item]

        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # print([torch.tensor(pred_prob[item], dtype=torch.float) for pred_prob in self.pred_probs])
        merged_prob = [torch.tensor(pred_prob[item], dtype=torch.float) for pred_prob in self.pred_probs]
        return {
            'review_text': review,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long),
            'pred_prob': torch.stack(merged_prob, dim=1),
            'additional_features': torch.tensor(additional_features, dtype=torch.float)
        }

# Create dataset instances
num_labels = len(labels.unique())-1
num_addition_columns = len(final_additional_features.columns)
# Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_dataset = ReviewDataset(
    reviews=ml_train[text_column].tolist(),
    labels=ml_train[label_column].tolist(),
    additional_features=additional_features_fin_train,
    # pred_probs=[train_pred_lr, train_pred_lr, train_pred_lr, train_pred_lr],
    pred_probs=[train_pred_lr, train_pred_rf, train_pred_xgb, train_pred_bt],
    tokenizer=tokenizer,
    max_len=128
)

val_dataset = ReviewDataset(
    reviews=ml_val[text_column].tolist(),
    labels=ml_val[label_column].tolist(),
    additional_features=additional_features_val,
    # pred_probs=[val_pred_lr, val_pred_lr, val_pred_lr, val_pred_lr],
    pred_probs=[val_pred_lr, val_pred_rf, val_pred_xgb, val_pred_bt],
    tokenizer=tokenizer,
    max_len=128
)


# Define training arguments
training_args = TrainingArguments(
    output_dir='./bert-finetuned',
    num_train_epochs=10,
    # learning_rate=5e-04,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    logging_dir='./logs',
    logging_steps=100,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    dataloader_num_workers=16,
    weight_decay=0.05,  # Add weight decay for regularization
)

# Define compute_metrics function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average='micro')
    return {
        'f1 micro': f1,
    }

# Initialize Trainer
trainer = Trainer(
    model=BERTWithAdditionalFeatures(num_additional_features=num_addition_columns, num_labels=num_labels, num_models=5),
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# Train the model if we are not using a saved model
continue_epoch = 0
saved_model_path = os.path.join(data_path, 'bert-finetuned', f'checkpoint-epoch-{continue_epoch}')
use_saved_model = os.path.exists(saved_model_path)
if continue_epoch < training_args.num_train_epochs:
    if use_saved_model:
        print(f"Continuing training from epoch {continue_epoch}")
        trainer.train(resume_from_checkpoint=saved_model_path)
    else:
        print("Training a new model")
        # Train the model with exception handling
        trainer.train()

    if False:
        checkpoint_dir = os.path.join(data_path, 'bert-finetuned', f'checkpoint-epoch-{int(trainer.state.epoch)}')
        trainer.save_model(checkpoint_dir)
        tokenizer.save_pretrained(checkpoint_dir)
        print(f"Saved model checkpoint to {checkpoint_dir}")
else:
    if use_saved_model:
        print("Loaded saved model from", saved_model_path)
        trainer.train(resume_from_checkpoint=saved_model_path)
    else:
        raise FileNotFoundError(f"Modele not found at {saved_model_path}")

# Evaluate the model on the validation set
eval_results = trainer.evaluate(eval_dataset=val_dataset)

# Print the F1 score
print(f"Validation F1 Score: {eval_results['eval_f1 micro']:.4f}")


Training a new model


  self.pid = os.fork()


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,F1 micro
1,1.7318,1.665236,0.969951
2,1.6644,1.65176,0.96919
3,1.6531,1.645415,0.968049
4,1.6481,1.642838,0.96919
5,1.643,1.638803,0.96919
6,1.6397,1.635736,0.969951
7,1.6369,1.634133,0.971092
8,1.6367,1.632701,0.972613
9,1.6327,1.631404,0.974135
10,1.6325,1.630905,0.973754


  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()


Validation F1 Score: 0.9738


In [103]:
print(trainer.model.weights)

Parameter containing:
tensor([[0.5631],
        [0.0395],
        [0.0708],
        [0.0231],
        [0.2894]], device='cuda:0', requires_grad=True)


In [104]:
test_dataset = ReviewDataset(
    reviews=ml_test[text_column].tolist(),
    labels=np.zeros(ml_test[label_column].shape).tolist(),
    additional_features=additional_features_fin_test,
    pred_probs=[test_pred_lr, test_pred_rf, test_pred_xgb, test_pred_bt],
    tokenizer=tokenizer,
    max_len=128
)

output_ml = trainer.predict(test_dataset)
# test_pred_ml = output_ml.predictions.argmax(-1).astype(int)
test_pred_ml = output_ml.predictions

  self.pid = os.fork()


In [None]:
label_final_predictions = label_encoder.inverse_transform(test_pred_ml.argmax(-1).astype(int))
dic = {"Id": [], "Predicted": []}
for i, pred in enumerate(label_final_predictions):
    dic["Id"].append(i)
    dic["Predicted"].append(pred)

dic_df = pd.DataFrame.from_dict(dic)
dic_df.to_csv(data_path + "predicted.csv", index=False)

#### Simple Joint

In [None]:
val_pred_ml = trainer.predict(val_dataset).predictions

In [107]:
import numpy as np

# Probabilities for sampling
assign_weight = [0.5,0.05,0.05,0.1,0.3]
prob = [val_pred_lr, val_pred_rf, val_pred_xgb, val_pred_bt, val_pred_ml]

pred_fin = 0
for i in range(len(assign_weight)):
    pred_fin+=prob[i]*assign_weight[i]

final_predictions = pred_fin.argmax(-1).astype(int)

print("Final Predictions:", final_predictions)

print("F1 micro:", f1_score(val_labels.tolist(), final_predictions, average='micro'))

Final Predictions: [0 6 8 ... 0 8 0]
F1 micro: 0.9920121719284899


In [115]:
import numpy as np

# Probabilities for sampling
assign_weight = [0.55,0.05,0.1,0.05,0.25]
# assign_weight = [0.5,0.0,0.0,0.0,0.5]
prob = [test_pred_lr, test_pred_rf, test_pred_xgb, test_pred_bt, test_pred_ml]

pred_fin = 0
for i in range(len(assign_weight)):
    pred_fin+=prob[i]*assign_weight[i]

final_predictions = pred_fin.argmax(-1).astype(int)

print("Final Predictions:", final_predictions)

label_final_predictions = label_encoder.inverse_transform(final_predictions)
dic = {"Id": [], "Predicted": []}
for i, pred in enumerate(label_final_predictions):
    dic["Id"].append(i)
    dic["Predicted"].append(pred)

dic_df = pd.DataFrame.from_dict(dic)
dic_df.to_csv(data_path + "predicted.csv", index=False)

Final Predictions: [3 9 9 ... 1 1 3]


#### Phrase mining (Deprecated)

In [122]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize
import string

ps = PorterStemmer()
def preprocess_df(df, stemming=False):
    # get English stopwords
    stop_words = set(stopwords.words('english'))
    # stop_words.add('would')
    # stop_words.add('The')
    # prepare translation table to translate punctuation to space
    translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    preprocessed_sentences = []
    for i, row in df.iterrows():
        sent = row[name_column]
        sent_nopuncts = sent.translate(translator)
        words_list = sent_nopuncts.strip().split()
        if stemming == True:
            words_list = [ps.stem(word) for word in words_list]
        filtered_words = [word for word in words_list if word not in stop_words and len(word) != 1 and word[0].isupper()] # also skip space from above translation
        preprocessed_sentences.append(" ".join(filtered_words))
    df["name_tokens"] = preprocessed_sentences
    return df

preprocess_df(data_encoded)
data_encoded['name_tokens'].head(20)

0                     Rush Inn
1                     GreenMix
2           BarBurrito Gerrard
3                     SalvaMex
4                     Hop Hing
5         Caramba Mexican Food
6                    Taco Bell
7                     Red Bowl
8            Beiruti Grand Caf
9                Tacos Tequila
10                Pasta Brioni
11           Spaghetti Company
12                       Rubio
13                   Egg Works
14                Thai Noodles
15        Green Tea Restaurant
16     Corrado Cucina Italiana
17                    WingBonz
18           Restaurant El Rey
19    Evergreen Chinese Buffet
Name: name_tokens, dtype: object

In [123]:
empty_name_tokens_indices = data_encoded[data_encoded['name_tokens'].apply(len) == 0].index.tolist()
data_encoded.loc[empty_name_tokens_indices, 'name']

42                           K\xc5\x8djin
130                                 iCook
1264                        S\xc3\xa9same
1473              the taverne of westlake
1887                               notion
2331                                 3030
2797                        pico de gallo
5151                           L'academie
5528                           L'academie
6189                            honeygrow
6402                              barMASA
6534                               kabuku
7798                               osmows
7834                                 505e
9268                            H\xc3\xa0
12281                K\xc5\xab-k\xc5\xadm
12439                        N\xc3\xbcjoy
13036                       hayashi maple
13437                                  3Q
14033                           T\xc3\xa9
14044                           T--o T--e
14390                 sarita a restaurant
15286                            iKitchen
15438                         wien

In [124]:
import pandas as pd
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

# Initialize the CountVectorizer
vectorizer = CountVectorizer()
X =  vectorizer.fit_transform(data_encoded["name_tokens"])
train_vec = X[tfidf_train_index]
val_vec = X[tfidf_val_index]
test_vec = X[tfidf_test_index]

# Fit the ClassTfidfTransformer
ctfidf = ClassTfidfTransformer()
X_train_ctfidf = ctfidf.fit_transform(train_vec, y=train_labels)

# Transform the validation and test data
X_val_counts = vectorizer.transform(val_vec)
X_val_ctfidf = ctfidf.transform(X_val_counts)

X_test_counts = vectorizer.transform(test_vec)
X_test_ctfidf = ctfidf.transform(X_test_counts)

  idf = np.log((avg_nr_samples / df)+1)


ValueError: operands could not be broadcast together with shapes (10553,) (10515,) 

In [125]:
import numpy as np
import scipy.sparse as sp

from sklearn.utils import check_array
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted


class CTFIDFVectorizer(TfidfTransformer):
    def __init__(self, *args, **kwargs):
        super(CTFIDFVectorizer, self).__init__(*args, **kwargs)
        self._idf_diag = None

    def fit(self, X: sp.csr_matrix, n_samples: int):
        """Learn the idf vector (global term weights)

        Parameters
        ----------
        X : sparse matrix of shape n_samples, n_features)
            A matrix of term/token counts.

        """

        # Prepare input
        X = check_array(X, accept_sparse=('csr', 'csc'))
        if not sp.issparse(X):
            X = sp.csr_matrix(X)
        dtype = X.dtype if X.dtype in FLOAT_DTYPES else np.float64

        # Calculate IDF scores
        _, n_features = X.shape
        df = np.squeeze(np.asarray(X.sum(axis=0)))
        avg_nr_samples = int(X.sum(axis=1).mean())
        idf = np.log(avg_nr_samples / df)
        self._idf_diag = sp.diags(idf, offsets=0,
                                  shape=(n_features, n_features),
                                  format='csr',
                                  dtype=dtype)
        return self

    def transform(self, X: sp.csr_matrix, copy=True) -> sp.csr_matrix:
        """Transform a count-based matrix to c-TF-IDF

        Parameters
        ----------
        X : sparse matrix of (n_samples, n_features)
            a matrix of term/token counts

        Returns
        -------
        vectors : sparse matrix of shape (n_samples, n_features)

        """

        # Prepare input
        X = check_array(X, accept_sparse='csr', dtype=FLOAT_DTYPES, copy=copy)
        if not sp.issparse(X):
            X = sp.csr_matrix(X, dtype=np.float64)

        n_samples, n_features = X.shape

        # idf_ being a property, the automatic attributes detection
        # does not work as usual and we need to specify the attribute
        # name:
        check_is_fitted(self, attributes=["idf_"],
                        msg='idf vector is not fitted')

        # Check if expected nr features is found
        expected_n_features = self._idf_diag.shape[0]
        if n_features != expected_n_features:
            raise ValueError("Input has n_features=%d while the model"
                             " has been trained with n_features=%d" % (
                                 n_features, expected_n_features))

        X = X * self._idf_diag

        if self.norm:
            X = normalize(X, axis=1, norm='l1', copy=False)

        return X

In [127]:
docs_per_class = data_encoded.groupby([label_column], as_index=False).agg({'name_tokens': ' '.join})

# Create bag of words
count_vectorizer = CountVectorizer().fit(docs_per_class.name_tokens)
count = count_vectorizer.transform(docs_per_class.name_tokens)
# words = count_vectorizer.get_feature_names()

# Extract top 10 words
ctfidf = CTFIDFVectorizer().fit_transform(count, n_samples=len(data_encoded)).toarray()
# words_per_class = {newsgroups.target_names[label]: [words[index] for index in ctfidf[label].argsort()[-10:]] for label in docs_per_class.Class}


In [129]:
ctfidf.shape

(10, 7492)

In [103]:
words

((10515, 10553), 10515)

In [None]:
# Train a classifier on the transformed data
rf_model = RandomForestClassifier(n_estimators=300, random_state=42)
rf_model.fit(X_train_ctfidf, train_df['label'])

# Validate the model
val_predictions = rf_model.predict(X_val_ctfidf)

# Calculate F1 micro score
f1_micro = f1_score(val_df['label'], val_predictions, average='micro')
print(f"Validation F1 Micro Score: {f1_micro:.4f}")

# Predict on the test set
test_predictions = rf_model.predict(X_test_ctfidf)

# Print the predictions
print("Test Predictions:", test_predictions)

#### Word to Vec (Deprecated)

In [64]:
data_encoded['name'].head(20)

0                      Rush Inn
1                      GreenMix
2          BarBurrito - Gerrard
3                      SalvaMex
4                      Hop Hing
5          Caramba Mexican Food
6                     Taco Bell
7                      Red Bowl
8     Beiruti Grand Caf\xc3\xa9
9         T&T - Tacos & Tequila
10                 Pasta Brioni
11          Spaghetti & Company
12                      Rubio's
13                    Egg Works
14                 Thai Noodles
15         Green Tea Restaurant
16    Corrado's Cucina Italiana
17                     WingBonz
18            Restaurant El Rey
19     Evergreen Chinese Buffet
Name: name, dtype: object

In [72]:
from nltk.tokenize import sent_tokenize, word_tokenize

def tokenize(X):
    def sub_tokenize(text):
        # step 1: get sentences
        sentences = sent_tokenize(text)
        # step 2: get tokens
        tokens = []
        for sent in sentences:
            words = word_tokenize(sent)
            # step 3 (optional): stemming
            words = [ps.stem(word) for word in words if word.lower() not in stop]
            tokens += words
        return [w.lower() for w in tokens]

    ps = PorterStemmer()
    stop = set(stopwords.words('english'))
    X_tokens = []
    for text in tqdm(X, desc='Tokenizing'):
        X_tokens.append(sub_tokenize(text))
    return X_tokens

tokenized_name = tokenize(data_encoded['name'])

Tokenizing: 100%|██████████| 23144/23144 [00:04<00:00, 5235.18it/s]


In [81]:
tokenized_name[:20]

[['rush', 'inn'],
 ['greenmix'],
 ['barburrito', '-', 'gerrard'],
 ['salvamex'],
 ['hop', 'hing'],
 ['caramba', 'mexican', 'food'],
 ['taco', 'bell'],
 ['red', 'bowl'],
 ['beiruti', 'grand', 'caf\\xc3\\xa9'],
 ['&', '-', 'taco', '&', 'tequila'],
 ['pasta', 'brioni'],
 ['spaghetti', '&', 'compani'],
 ['rubio', "'s"],
 ['egg', 'work'],
 ['thai', 'noodl'],
 ['green', 'tea', 'restaur'],
 ['corrado', "'s", 'cucina', 'italiana'],
 ['wingbonz'],
 ['restaur', 'el', 'rey'],
 ['evergreen', 'chines', 'buffet']]

In [70]:
import gensim.downloader as api
from gensim.models import KeyedVectors

# Load the pre-trained Word2Vec model from gensim-data
# This might take a while to download if it's the first time
model = api.load('word2vec-google-news-300')

# Flatten the list of lists into a single list of tokens
all_tokens = [token for sublist in tokenized_name for token in sublist]

# Find tokens not in the Word2Vec model
tokens_not_in_model = set(token for token in all_tokens if token not in model.key_to_index)

print("Tokens not in the Word2Vec model:", tokens_not_in_model)


Tokens not in the Word2Vec model: {'foodtruck', 'dimartino', 'oohmami', 'biir\\xc5\\xab', "l'assommoir", 'onyxx', 'purisima', 'novat', 'westcliff', 'caravelle', 'cevicheria', 'cascio', 'mexi-casa', 'karey', 'karin', 'aladdins', 'keiths', 'marimar', 'adolfo', 'vendome', 'kanbai', 'menya', 'cambodge', 'winncie', '22', 'lawry', 'calakmul', 'kierland', 'tibetain', 'lears', 'blaqcat', 'nagomi', 'kristy', 'unionville', 'du-pars', 'sausalito', '808', 'mowry', 'hellriegel', 'thirty-six', 'midwood', 'arata', 'edgewater', 'komeya', 'encanto', 'wakado', 'antica', 'mezcaleros', 'cin-cin', 'kowloon', 'alida', 'wiang', 'wuhan', 'zaba', '63', 'levant', 'farley', 'brixx', 'giacomo', 'rivas', 'spoonz', 'b19', 'langtree', "l'asie", 'johnsons', 'mayan', 'karizma', '911', 'alexion', 'calavittas', 'ichie', 'vincenzos', 'kashima', 'natali', 'skynyrd', 'treno', 'waba', 'calabria', 'brunos', 'moondoo', 'cogburns', 'otani', 'mocorito', 'morsi', '1918', 'torretta', 'tortaco', 'vaporcito', 'pizzetta', 'fukuda', 

In [71]:
len(set(all_tokens)), len(tokens_not_in_model)

(11182, 5137)

In [85]:
model['brioni']

KeyError: "Key 'brioni' not present"