In [1]:
import json
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load the JSON data
with open(r"C:\Users\HP-User\Downloads\dataset_final.json", 'r') as f:
    data = json.load(f)

In [2]:
import pandas as pd

# Create empty lists to store the data
sids = []
post_descriptions = []
interests = []

# Iterate over each sid and its data
for sid, sid_data in data.items():
    sids.append(sid)
    
    # Combine post descriptions into a single string
    descriptions = ' '.join(post_data['description'] for post_data in sid_data['posts'].values())
    post_descriptions.append(descriptions)
    
    interests.append(sid_data['interests'])

# Create a DataFrame
df = pd.DataFrame({
    'sid': sids,
    'posts': post_descriptions,
    'interests': interests
})

# Display the DataFrame
print(df)


               sid                                              posts  \
0     sid : 324309  roti chickenswarma pita falafel nycfoodgals ro...   
1     sid : 324470  crepemaster 5th avenue 125th 124th closing sad...   
2     sid : 324673  purple people bridge take tastes little like e...   
3     sid : 324728  dover fwends thebookinghall big night ouuuuuuu...   
4     sid : 324945  dead meme flavor anyonenncr sixtles via twitte...   
..             ...                                                ...   
280   sid : 357066  know go back shooting outfits soon saved phone...   
281  sid : 3571213            glove able get fairly close bird fli...   
282  sid : 3572469  lichen encrusted bench photobombed passing bir...   
283  sid : 3573923  afternoon much ado tolethorpe wonderful tis se...   
284  sid : 3577934  theres lots giving lent think thats brilliant ...   

                           interests  
0           [FOOD, COOKING, CUISINE]  
1             [FOOD, CURRENT EVENTS]  
2     

In [3]:
df

Unnamed: 0,sid,posts,interests
0,sid : 324309,roti chickenswarma pita falafel nycfoodgals ro...,"[FOOD, COOKING, CUISINE]"
1,sid : 324470,crepemaster 5th avenue 125th 124th closing sad...,"[FOOD, CURRENT EVENTS]"
2,sid : 324673,purple people bridge take tastes little like e...,"[COOKING, CUISINE, FRIENDSHIP]"
3,sid : 324728,dover fwends thebookinghall big night ouuuuuuu...,"[MUSIC, LIVE EVENTS, TRAVEL]"
4,sid : 324945,dead meme flavor anyonenncr sixtles via twitte...,"[ADVERTISING, SALES]"
...,...,...,...
280,sid : 357066,know go back shooting outfits soon saved phone...,"[CLOTHING, FASHION ACCESSORIES]"
281,sid : 3571213,glove able get fairly close bird fli...,[ARTS AND MUSIC]
282,sid : 3572469,lichen encrusted bench photobombed passing bir...,[ARTS AND MUSIC]
283,sid : 3573923,afternoon much ado tolethorpe wonderful tis se...,[ARTS AND MUSIC]


In [4]:
df['sid'] = df['sid'].str.replace('sid : ', '')
df

Unnamed: 0,sid,posts,interests
0,324309,roti chickenswarma pita falafel nycfoodgals ro...,"[FOOD, COOKING, CUISINE]"
1,324470,crepemaster 5th avenue 125th 124th closing sad...,"[FOOD, CURRENT EVENTS]"
2,324673,purple people bridge take tastes little like e...,"[COOKING, CUISINE, FRIENDSHIP]"
3,324728,dover fwends thebookinghall big night ouuuuuuu...,"[MUSIC, LIVE EVENTS, TRAVEL]"
4,324945,dead meme flavor anyonenncr sixtles via twitte...,"[ADVERTISING, SALES]"
...,...,...,...
280,357066,know go back shooting outfits soon saved phone...,"[CLOTHING, FASHION ACCESSORIES]"
281,3571213,glove able get fairly close bird fli...,[ARTS AND MUSIC]
282,3572469,lichen encrusted bench photobombed passing bir...,[ARTS AND MUSIC]
283,3573923,afternoon much ado tolethorpe wonderful tis se...,[ARTS AND MUSIC]


In [5]:
print(df.loc[0, 'posts'])


roti chickenswarma pita falafel nycfoodgals roti modern mediterranean pita falafel schwarma nycfoodgals zaros bakery nationalbagelday favoriteholiday nycfoodgals vintner wine market prosciutto figjam hellskitchen nyclunch nycfoodgals wayla moosarong meatballs thaifood nycfoodgals butterfly bakeshop customcake nycbakery candyland butterflybakeshop nycfoodgals butterflybakeshop giveawaynin honor pride month justsalad bringing back proud salad giving away 50 gift cards 2 followers nnto enternlike pic follow justsalad nycfoodgals tag friend comments belownn1 comment1 entrynnwinners picked wednesday 65 contest closednn1 every proud salad sold donated nycpride cinnamon snail doughnuts vegan cinnamonsnail nycfoodgals nycfoodgals patsys italian restaurant friedmozzarella cheesepull patsysitalianrestaurant 75years nycfoodgals osprey cobbsalad theosprey 1hotels brooklynbridge nycfoodgals orwashers pecanpie piday orwashers uppereastside nycfoodgals fiaschetteria pistoia tagliatelle blacktuffle eg

In [6]:
# Explode the DataFrame based on the 'interests' column
df_exploded = df.explode('interests')

# Reset the index of the exploded DataFrame
df_exploded = df_exploded.reset_index(drop=True)

# Display the exploded DataFrame
print(df_exploded)


         sid                                              posts  \
0     324309  roti chickenswarma pita falafel nycfoodgals ro...   
1     324309  roti chickenswarma pita falafel nycfoodgals ro...   
2     324309  roti chickenswarma pita falafel nycfoodgals ro...   
3     324470  crepemaster 5th avenue 125th 124th closing sad...   
4     324470  crepemaster 5th avenue 125th 124th closing sad...   
..       ...                                                ...   
497   357066  know go back shooting outfits soon saved phone...   
498  3571213            glove able get fairly close bird fli...   
499  3572469  lichen encrusted bench photobombed passing bir...   
500  3573923  afternoon much ado tolethorpe wonderful tis se...   
501  3577934  theres lots giving lent think thats brilliant ...   

               interests  
0                   FOOD  
1                COOKING  
2                CUISINE  
3                   FOOD  
4         CURRENT EVENTS  
..                   ...  
497  F

In [7]:
df_exploded

Unnamed: 0,sid,posts,interests
0,324309,roti chickenswarma pita falafel nycfoodgals ro...,FOOD
1,324309,roti chickenswarma pita falafel nycfoodgals ro...,COOKING
2,324309,roti chickenswarma pita falafel nycfoodgals ro...,CUISINE
3,324470,crepemaster 5th avenue 125th 124th closing sad...,FOOD
4,324470,crepemaster 5th avenue 125th 124th closing sad...,CURRENT EVENTS
...,...,...,...
497,357066,know go back shooting outfits soon saved phone...,FASHION ACCESSORIES
498,3571213,glove able get fairly close bird fli...,ARTS AND MUSIC
499,3572469,lichen encrusted bench photobombed passing bir...,ARTS AND MUSIC
500,3573923,afternoon much ado tolethorpe wonderful tis se...,ARTS AND MUSIC


In [8]:
# Convert 'interests' column to single quoted strings
df_exploded['interests'] = df_exploded['interests'].apply(lambda x: "'" + x + "'")

# Display the updated DataFrame
df_exploded


Unnamed: 0,sid,posts,interests
0,324309,roti chickenswarma pita falafel nycfoodgals ro...,'FOOD'
1,324309,roti chickenswarma pita falafel nycfoodgals ro...,'COOKING'
2,324309,roti chickenswarma pita falafel nycfoodgals ro...,'CUISINE'
3,324470,crepemaster 5th avenue 125th 124th closing sad...,'FOOD'
4,324470,crepemaster 5th avenue 125th 124th closing sad...,'CURRENT EVENTS'
...,...,...,...
497,357066,know go back shooting outfits soon saved phone...,'FASHION ACCESSORIES'
498,3571213,glove able get fairly close bird fli...,'ARTS AND MUSIC'
499,3572469,lichen encrusted bench photobombed passing bir...,'ARTS AND MUSIC'
500,3573923,afternoon much ado tolethorpe wonderful tis se...,'ARTS AND MUSIC'


In [9]:
grouped = df_exploded.groupby('interests')

# Create an empty list to store the grouped DataFrames
grouped_dfs = []

# Convert each group into a DataFrame
for interest, group in grouped:
    group_df = pd.DataFrame(group)
    grouped_dfs.append(group_df)

# Concatenate the grouped DataFrames back together
grouped_df = pd.concat(grouped_dfs)

# Reset the index of the DataFrame
grouped_df = grouped_df.reset_index(drop=True)

# Display the grouped DataFrame
print(grouped_df)



         sid                                              posts      interests
0     324945  dead meme flavor anyonenncr sixtles via twitte...  'ADVERTISING'
1     325362  snap guide bossing monday stop tuyolondon pick...  'ADVERTISING'
2     329462  still one favourite shoots chapelprecinct np l...  'ADVERTISING'
3     333790  exhibit garbage good morning iowa dadisfat boo...  'ADVERTISING'
4    3217585  calm sea set beautiful backdrop michjanemiller...  'ADVERTISING'
..       ...                                                ...            ...
497  4340257  terrific villa toscana miami terrific birthday...     'WEDDINGS'
498  3355851  welcome design created gervasoni familys first...     'WEDDINGS'
499  3355851  welcome design created gervasoni familys first...     'WEDDINGS'
500  3475367  divine magical unbelievable divine corporatefu...     'WEDDINGS'
501  3475367  divine magical unbelievable divine corporatefu...     'WEDDINGS'

[502 rows x 3 columns]


In [10]:
grouped_df

Unnamed: 0,sid,posts,interests
0,324945,dead meme flavor anyonenncr sixtles via twitte...,'ADVERTISING'
1,325362,snap guide bossing monday stop tuyolondon pick...,'ADVERTISING'
2,329462,still one favourite shoots chapelprecinct np l...,'ADVERTISING'
3,333790,exhibit garbage good morning iowa dadisfat boo...,'ADVERTISING'
4,3217585,calm sea set beautiful backdrop michjanemiller...,'ADVERTISING'
...,...,...,...
497,4340257,terrific villa toscana miami terrific birthday...,'WEDDINGS'
498,3355851,welcome design created gervasoni familys first...,'WEDDINGS'
499,3355851,welcome design created gervasoni familys first...,'WEDDINGS'
500,3475367,divine magical unbelievable divine corporatefu...,'WEDDINGS'


In [11]:
grouped_df['interests'].value_counts()

'FOOD'                          59
'TRAVEL'                        51
'ARTS AND MUSIC'                30
'BEAUTY'                        30
'FASHION ACCESSORIES'           30
'CLOTHING'                      28
'CUISINE'                       25
'RESTAURANTS'                   22
'SPORTS'                        20
'FAMILY AND RELATIONSHIPS'      17
'FRIENDSHIP'                    15
'WEDDINGS'                      14
'LIVE EVENTS'                   14
'CURRENT EVENTS'                14
'ADVERTISING'                   13
'MUSIC'                         12
'HOME AND GARDEN'               11
'PETS'                          11
'ARCHITECTURE'                  10
'POLITICS AND SOCIAL ISSUES'     8
'TV'                             7
'SALES'                          6
'PHYSICAL FITNESS'               6
'ALCOHOLIC BEVERAGES'            6
'DESIGN'                         6
'COOKING'                        5
'BEVERAGES'                      5
'VEHICLES'                       4
'OUTDOOR RECREATION'

In [12]:
# Drop rows with 'interests' containing "SHOPPING"
grouped_df = grouped_df[~grouped_df['interests'].str.contains('SHOPPING')]

# Reset the index of the DataFrame
grouped_df = grouped_df.reset_index(drop=True)

# Display the resulting DataFrame
grouped_df


Unnamed: 0,sid,posts,interests
0,324945,dead meme flavor anyonenncr sixtles via twitte...,'ADVERTISING'
1,325362,snap guide bossing monday stop tuyolondon pick...,'ADVERTISING'
2,329462,still one favourite shoots chapelprecinct np l...,'ADVERTISING'
3,333790,exhibit garbage good morning iowa dadisfat boo...,'ADVERTISING'
4,3217585,calm sea set beautiful backdrop michjanemiller...,'ADVERTISING'
...,...,...,...
496,4340257,terrific villa toscana miami terrific birthday...,'WEDDINGS'
497,3355851,welcome design created gervasoni familys first...,'WEDDINGS'
498,3355851,welcome design created gervasoni familys first...,'WEDDINGS'
499,3475367,divine magical unbelievable divine corporatefu...,'WEDDINGS'


In [13]:
grouped_df['interests'].value_counts()

'FOOD'                          59
'TRAVEL'                        51
'ARTS AND MUSIC'                30
'BEAUTY'                        30
'FASHION ACCESSORIES'           30
'CLOTHING'                      28
'CUISINE'                       25
'RESTAURANTS'                   22
'SPORTS'                        20
'FAMILY AND RELATIONSHIPS'      17
'FRIENDSHIP'                    15
'WEDDINGS'                      14
'LIVE EVENTS'                   14
'CURRENT EVENTS'                14
'ADVERTISING'                   13
'MUSIC'                         12
'PETS'                          11
'HOME AND GARDEN'               11
'ARCHITECTURE'                  10
'POLITICS AND SOCIAL ISSUES'     8
'TV'                             7
'PHYSICAL FITNESS'               6
'ALCOHOLIC BEVERAGES'            6
'DESIGN'                         6
'SALES'                          6
'BEVERAGES'                      5
'COOKING'                        5
'VEHICLES'                       4
'OUTDOOR RECREATION'

In [14]:
# Get the count of rows for each interest
interest_counts = grouped_df['interests'].value_counts()

# Create empty lists to store the train and test sets
train_data = []
test_data = []

# Iterate over each interest and perform train-test split
for interest in interest_counts.index:
    group = grouped_df[grouped_df['interests'] == interest]
    n_train = int(interest_counts[interest] * 0.75)
    train_examples = group.head(n_train)
    test_examples = group.tail(interest_counts[interest] - n_train)
    train_data.append(train_examples)
    test_data.append(test_examples)

# Concatenate the train and test sets
X_train = pd.concat(train_data)['posts']
X_test = pd.concat(test_data)['posts']
Y_train = pd.concat(train_data)['interests']
Y_test = pd.concat(test_data)['interests']


In [15]:
Y_test

270                'FOOD'
271                'FOOD'
272                'FOOD'
273                'FOOD'
274                'FOOD'
              ...        
178    'ENTREPRENEURSHIP'
373             'READING'
176           'ECONOMICS'
331              'MOVIES'
301         'HEALTH CARE'
Name: interests, Length: 140, dtype: object

In [16]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Tokenization
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode input sequences
X_train_encoded = tokenizer.batch_encode_plus(X_train.tolist(), padding=True, truncation=True, return_tensors='pt')
X_test_encoded = tokenizer.batch_encode_plus(X_test.tolist(), padding=True, truncation=True, return_tensors='pt')

# Convert labels to one-hot encoding
label_encoder = LabelEncoder()
onehot_encoder = OneHotEncoder(sparse=False)

# Encode training labels
Y_train_encoded = label_encoder.fit_transform(Y_train)
Y_train_onehot = onehot_encoder.fit_transform(Y_train_encoded.reshape(-1, 1))

# Encode test labels
Y_test_encoded = label_encoder.transform(Y_test)
Y_test_onehot = onehot_encoder.transform(Y_test_encoded.reshape(-1, 1))

# Create PyTorch datasets
train_dataset = TensorDataset(
    X_train_encoded['input_ids'],
    X_train_encoded['attention_mask'],
    torch.tensor(Y_train_onehot, dtype=torch.float)
)
test_dataset = TensorDataset(
    X_test_encoded['input_ids'],
    X_test_encoded['attention_mask'],
    torch.tensor(Y_test_onehot, dtype=torch.float)
)

# Define BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(grouped_df['interests'].unique()))


# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Define training parameters
batch_size = 8
num_epochs = 3
learning_rate = 2e-5

# Define data loaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Define optimizer and loss function
optimizer = AdamW(model.parameters(), lr=learning_rate)
loss_fn = torch.nn.CrossEntropyLoss()

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in train_loader:
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    average_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {average_loss:.4f}')

   


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch 1/3, Train Loss: 0.4772
Epoch 2/3, Train Loss: 0.2705
Epoch 3/3, Train Loss: 0.1888


In [28]:
# Evaluation
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, predicted = torch.max(outputs[0], dim=1)


        total += labels.size(0)
        correct += (predicted == labels.argmax(dim=1)).sum().item()

accuracy = correct / total
print(f'Test Accuracy: {accuracy:.4f}')


Test Accuracy: 0.1071


In [21]:
print(model.config.num_labels)  # Should output 37


37


In [31]:
import numpy as np

# Prediction
model.eval()
predictions = []

with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        probabilities = torch.softmax(outputs.logits, dim=1)
        _, predicted_indices = torch.topk(probabilities, k=3, dim=1)

        for indices in predicted_indices:
            predicted_labels = label_encoder.inverse_transform(indices.cpu().numpy())
            predictions.append(predicted_labels.tolist())

# Compare predictions to actual values
actual_values = Y_test.tolist()

for i in range(len(actual_values)):
    print(f"Example {i+1}:")
    print("Predicted interests:", predictions[i])
    print("Actual interests:", actual_values[i])
    print()


Example 1:
Predicted interests: ["'FOOD'", "'TRAVEL'", "'MUSIC'"]
Actual interests: 'FOOD'

Example 2:
Predicted interests: ["'FOOD'", "'TRAVEL'", "'MUSIC'"]
Actual interests: 'FOOD'

Example 3:
Predicted interests: ["'FOOD'", "'TRAVEL'", "'MUSIC'"]
Actual interests: 'FOOD'

Example 4:
Predicted interests: ["'FOOD'", "'TRAVEL'", "'MUSIC'"]
Actual interests: 'FOOD'

Example 5:
Predicted interests: ["'FOOD'", "'TRAVEL'", "'MUSIC'"]
Actual interests: 'FOOD'

Example 6:
Predicted interests: ["'FOOD'", "'TRAVEL'", "'MUSIC'"]
Actual interests: 'FOOD'

Example 7:
Predicted interests: ["'FOOD'", "'TRAVEL'", "'MUSIC'"]
Actual interests: 'FOOD'

Example 8:
Predicted interests: ["'FOOD'", "'TRAVEL'", "'MUSIC'"]
Actual interests: 'FOOD'

Example 9:
Predicted interests: ["'FOOD'", "'TRAVEL'", "'MUSIC'"]
Actual interests: 'FOOD'

Example 10:
Predicted interests: ["'FOOD'", "'TRAVEL'", "'MUSIC'"]
Actual interests: 'FOOD'

Example 11:
Predicted interests: ["'FOOD'", "'TRAVEL'", "'MUSIC'"]
Actual inter