In [1]:
import numpy as np
import pandas as pd 
import zipfile
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from nltk.corpus import stopwords
import nltk
import string
import torch
from transformers import RobertaTokenizer, RobertaModel
from sklearn.manifold import TSNE
import umap
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.manifold import TSNE
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Load the reviews without extracting the archive: the JSON member inside
# data.zip is opened as a file object and parsed as line-delimited JSON.
with zipfile.ZipFile('data.zip') as archive:
    with archive.open('data/renttherunway_final_data.json') as json_file:
        df_rtr = pd.read_json(json_file, lines=True)

# Preview the first rows
df_rtr.head()

Unnamed: 0,fit,user_id,bust size,item_id,weight,rating,rented for,review_text,body type,review_summary,category,height,size,age,review_date
0,fit,420272,34d,2260466,137lbs,10.0,vacation,An adorable romper! Belt and zipper were a lit...,hourglass,So many compliments!,romper,"5' 8""",14,28.0,"April 20, 2016"
1,fit,273551,34b,153475,132lbs,10.0,other,I rented this dress for a photo shoot. The the...,straight & narrow,I felt so glamourous!!!,gown,"5' 6""",12,36.0,"June 18, 2013"
2,fit,360448,,1063761,,10.0,party,This hugged in all the right places! It was a ...,,It was a great time to celebrate the (almost) ...,sheath,"5' 4""",4,116.0,"December 14, 2015"
3,fit,909926,34c,126335,135lbs,8.0,formal affair,I rented this for my company's black tie award...,pear,Dress arrived on time and in perfect condition.,dress,"5' 5""",8,34.0,"February 12, 2014"
4,fit,151944,34b,616682,145lbs,10.0,wedding,I have always been petite in my upper body and...,athletic,Was in love with this dress !!!,gown,"5' 9""",12,27.0,"September 26, 2016"


In [3]:
# Keep only the first occurrence of each fully identical row
df_rtr = df_rtr.loc[~df_rtr.duplicated()]

In [4]:
# Sanity check: number of duplicated rows still present
duplicate_mask = df_rtr.duplicated()
duplicate_mask.sum()

0

In [5]:
# Number of missing values in each column
df_rtr.isna().sum()

fit                   0
user_id               0
bust size         18392
item_id               0
weight            29955
rating               81
rented for           10
review_text           0
body type         14625
review_summary        0
category              0
height              675
size                  0
age                 960
review_date           0
dtype: int64

In [6]:
# Share of missing values per column, expressed in percent (two decimals)
missing_counts = df_rtr.isna().sum()
(missing_counts * 100 / len(df_rtr)).round(2)

fit                0.00
user_id            0.00
bust size          9.56
item_id            0.00
weight            15.57
rating             0.04
rented for         0.01
review_text        0.00
body type          7.60
review_summary     0.00
category           0.00
height             0.35
size               0.00
age                0.50
review_date        0.00
dtype: float64

### NLP Classifier Using TF-IDF

#### Build a simple machine learning model (like Logistic Regression or SVM) to classify fit.
This is a baseline approach where text is tokenized, converted into a numerical feature matrix, and fed into a classifier.

In [7]:
# One text field per review: the summary, a single space, then the full review text
df_rtr['combined_text'] = df_rtr['review_summary'].add(" ").add(df_rtr['review_text'])

In [8]:
# Encode Target Variable: convert the 'fit' categories into integer labels
# using a fixed order (small=0, fit=1, large=2).

custom_order = np.array(['small', 'fit', 'large'])

# `le` is kept (with classes_ set) so that `le.classes_` stays available for
# reporting. The encoding itself no longer goes through le.transform():
# LabelEncoder expects classes_ to be produced by fit(), so transforming
# against a hand-set, unsorted classes_ depends on implementation details.
le = LabelEncoder()
le.classes_ = custom_order

# Explicit label -> code mapping in the same order as custom_order
fit_to_code = {label: code for code, label in enumerate(custom_order.tolist())}

# Fail loudly (same exception type as LabelEncoder.transform) rather than
# silently producing NaN codes for missing or unknown labels.
unexpected_labels = set(df_rtr['fit'].dropna().unique()) - set(fit_to_code)
if unexpected_labels or df_rtr['fit'].isna().any():
    raise ValueError(
        f"'fit' contains missing values or labels outside {custom_order.tolist()}: "
        f"{sorted(unexpected_labels)}"
    )

# Transform target variable
df_rtr['fit_encoded'] = df_rtr['fit'].map(fit_to_code).astype('int64')

In [9]:
# TF-IDF representation of the combined text: English stop words are dropped
# and the vocabulary is capped at 5,000 terms.
y = df_rtr['fit_encoded']  # encoded target
review_corpus = df_rtr['combined_text']
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X = vectorizer.fit_transform(review_corpus)

#####
X is the feature matrix produced by transforming the combined text (review_summary followed by review_text) into numerical form with a TF-IDF vectorizer.

For instance, with `TfidfVectorizer(max_features=5000)`, X is a sparse matrix whose rows are reviews and whose columns are the 5,000 most frequent terms in the corpus (after English stop words are removed); each stored value is the TF-IDF weight of that term in that review.


In [10]:
# Show the stored entries of the sparse TF-IDF matrix as (row, column) and weight
print(X)

  (0, 1045)	0.2033926255302943
  (0, 250)	0.26817707799101004
  (0, 3656)	0.2639702230665475
  (0, 524)	0.24374466411047713
  (0, 4993)	0.19984675536677607
  (0, 2565)	0.11346860992392349
  (0, 2088)	0.22597263015191554
  (0, 2867)	0.45424084080457444
  (0, 1250)	0.18160078836542434
  (0, 4839)	0.10461768100123445
  (0, 494)	0.31092890408024376
  (0, 4732)	0.2382951339460545
  (0, 1608)	0.22301854855442604
  (0, 4904)	0.19439727282251085
  (0, 3250)	0.19186631731094558
  (0, 191)	0.17819060605480072
  (0, 3148)	0.10221864303060015
  (0, 2000)	0.1259896593131114
  (0, 2761)	0.26001784396666633
  (1, 1700)	0.13960742542494722
  (1, 1968)	0.4041693646230863
  (1, 3547)	0.16086778757190046
  (1, 1435)	0.12054548780817384
  (1, 3175)	0.23752336203628668
  (1, 3890)	0.32499001822132983
  :	:
  (192354, 3492)	0.08735222503519205
  (192354, 1738)	0.04937602893230258
  (192354, 4924)	0.12013544041329292
  (192354, 301)	0.09862422272162094
  (192354, 2908)	0.07573449463295553
  (192354, 4849)	0.

In [11]:
##

In [12]:
# Rebuild the combined text field: review summary + " " + review text
summary_with_space = df_rtr['review_summary'].add(" ")
df_rtr['combined_text'] = summary_with_space.add(df_rtr['review_text'])


In [13]:
# Encode target variable with a fixed order: small=0, fit=1, large=2
custom_order = np.array(['small', 'fit', 'large'])

# `le` only carries classes_ (used later for report labels). The encoding is
# done with an explicit mapping instead of le.transform(), because transforming
# against a hand-set, unsorted classes_ on an unfitted LabelEncoder depends on
# implementation details.
le = LabelEncoder()
le.classes_ = custom_order

fit_to_code = {label: code for code, label in enumerate(custom_order.tolist())}

# Raise ValueError (as LabelEncoder.transform would) on missing/unknown labels
unexpected_labels = set(df_rtr['fit'].dropna().unique()) - set(fit_to_code)
if unexpected_labels or df_rtr['fit'].isna().any():
    raise ValueError(
        f"'fit' contains missing values or labels outside {custom_order.tolist()}: "
        f"{sorted(unexpected_labels)}"
    )

df_rtr['fit_encoded'] = df_rtr['fit'].map(fit_to_code).astype('int64')

In [14]:
# Hold out 20% of the reviews for testing; the seed is fixed so the split is repeatable.
# X is the raw combined text (vectorization happens later), y the encoded fit label.
X, y = df_rtr['combined_text'], df_rtr['fit_encoded']
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [15]:
# # Define pipeline
# pipeline = Pipeline([
#     ('tfidf', TfidfVectorizer(stop_words='english')),
#     ('clf', LogisticRegression(max_iter=1000))
# ])

# # Define grid search parameters
# param_grid = {
#     'tfidf__max_features': [3000, 5000, 10000],
#     'tfidf__ngram_range': [(1, 1), (1, 2)],
#     'clf__C': [0.1, 1, 10],
# }

# # Perform grid search
# grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
# grid_search.fit(X_train, y_train)

# # Best model evaluation
# print("Best Parameters:", grid_search.best_params_)
# print("Best Cross-Validation Score:", grid_search.best_score_)
# best_model = grid_search.best_estimator_
# y_pred = best_model.predict(X_test)
# print(classification_report(y_test, y_pred, target_names=le.classes_))

In [16]:
##

In [17]:
# Two-step text pipeline: TF-IDF features followed by a classifier.
# The 'clf' step is only a placeholder; the parameter grid supplies the
# estimator(s) that actually get evaluated under that step name.
tfidf_step = ('tfidf', TfidfVectorizer(stop_words='english'))
classifier_step = ('clf', LogisticRegression())
pipeline = Pipeline(steps=[tfidf_step, classifier_step])

In [18]:
# # Define the Parameter Grid: Include different classifiers and their respective hyperparameters in the parameter grid.
# param_grid = [
#     {
#         'tfidf__max_features': [3000, 5000],
#         'tfidf__ngram_range': [(1, 1), (1, 2)],
#         'clf': [LogisticRegression(max_iter=1000)],
#         'clf__C': [0.1, 1, 10]  # Logistic Regression hyperparameters
#     },
# #     {
# #         'tfidf__max_features': [3000, 5000],
# #         'tfidf__ngram_range': [(1, 1), (1, 2)],
# #         'clf': [RandomForestClassifier()],
# #         'clf__n_estimators': [100, 200],  # Random Forest hyperparameters
# #         'clf__max_depth': [None, 10]
# #     },
# #     {
# #         'tfidf__max_features': [3000, 5000],
# #         'tfidf__ngram_range': [(1, 1), (1, 2)],
# #         'clf': [SVC()],
# #         'clf__C': [0.1, 1, 10],  # SVM hyperparameters
# #         'clf__kernel': ['linear', 'rbf']
# #     }
# ]

#####
When dealing with classification tasks, especially those with imbalanced classes, the F1-score is often a better metric than accuracy. The F1-score is the harmonic mean of precision and recall, and it provides a more balanced view of the model's performance.

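**f1_weighted** calculates the F1-score for each class and averages them, weighting each class by its number of true instances (its support). The score therefore reflects the actual class distribution, but it is dominated by the majority class; use `f1_macro` instead when every class should count equally.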

In [19]:
# # Run Grid Search: Use GridSearchCV to find the best combination of vectorizer settings, classifier, and hyperparameters.
# grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='f1_weighted', n_jobs=-1)
# grid_search.fit(X_train, y_train)

In [20]:
# # Evaluate the Best Model: After training, evaluate the best model on the test set.
# print("Best Parameters:", grid_search.best_params_)
# print("Best Cross-Validation Score:", grid_search.best_score_)

# best_model = grid_search.best_estimator_
# y_pred = best_model.predict(X_test)
# print(classification_report(y_test, y_pred, target_names=le.classes_))

In [21]:
###

In [22]:
# # Define the Parameter Grid: Include different classifiers and their respective hyperparameters in the parameter grid.
# param_grid = [
#     {
#         'tfidf__max_features': [3000, 5000],
#         'tfidf__ngram_range': [(1, 1), (1, 2)],
#         'clf': [LogisticRegression(max_iter=1000)],
#         'clf__C': [0.1, 1, 10]  # Logistic Regression hyperparameters
#     },
#     {
#         'tfidf__max_features': [3000, 5000],
#         'tfidf__ngram_range': [(1, 1), (1, 2)],
#         'clf': [MultinomialNB()],
#         'clf__alpha': [0.01, 0.1, 1, 10]  # Smoothing parameter for Naive Bayes
#     },
#      {
#          'tfidf__max_features': [3000, 5000],
#          'tfidf__ngram_range': [(1, 1), (1, 2)],
#          'clf': [SVC()],
#          'clf__C': [0.1, 1, 10],  # SVM hyperparameters
#          'clf__kernel': ['linear', 'rbf']
#      }
# ]

In [23]:
# Reduced search space: one TF-IDF configuration and Logistic Regression only.
# (MultinomialNB and linear-kernel SVC entries are deliberately left out of this grid.)
tfidf_options = {
    'tfidf__max_features': [5000],   # single vocabulary size
    'tfidf__ngram_range': [(1, 1)],  # unigrams only
}

param_grid = [
    # Logistic Regression with a single regularisation strength
    {**tfidf_options, 'clf': [LogisticRegression(max_iter=1000)], 'clf__C': [1]},
]


In [24]:
# Search space used for the grid search: one TF-IDF configuration shared by
# two candidate classifiers. (A linear-kernel SVC entry is deliberately left out.)
shared_tfidf_grid = {
    'tfidf__max_features': [5000],   # single vocabulary size
    'tfidf__ngram_range': [(1, 1)],  # unigrams only
}

param_grid = [
    # Logistic Regression with a single regularisation strength
    {**shared_tfidf_grid, 'clf': [LogisticRegression(max_iter=1000)], 'clf__C': [1]},
    # Multinomial Naive Bayes with a single smoothing value
    {**shared_tfidf_grid, 'clf': [MultinomialNB()], 'clf__alpha': [1]},
]

In [25]:
# Cross-validated search (3 folds, weighted F1) over every combination in
# param_grid, applied to the TF-IDF + classifier pipeline.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1_weighted',
    cv=3,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

In [26]:
# Take the best pipeline found by the search and predict the held-out test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Report the selected configuration, its cross-validation score, and the
# per-class test metrics (labelled in the order of le.classes_)
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)
print(classification_report(y_test, y_pred, target_names=le.classes_))

Best Parameters: {'clf': LogisticRegression(max_iter=1000), 'clf__C': 1, 'tfidf__max_features': 5000, 'tfidf__ngram_range': (1, 1)}
Best Cross-Validation Score: 0.7865177991490384
              precision    recall  f1-score   support

       small       0.68      0.39      0.50      5078
         fit       0.83      0.95      0.88     28414
       large       0.71      0.40      0.51      4979

    accuracy                           0.81     38471
   macro avg       0.74      0.58      0.63     38471
weighted avg       0.79      0.81      0.79     38471



### Word Embedding Models (BERT)

####
Use pre-trained deep learning models, like BERT or RoBERTa, to get rich, contextualized embeddings for the review text (here, the combined summary and review text).

**What This Means:** 
Word embeddings represent the semantic meaning of text in a high-dimensional space.
By averaging token embeddings, you get a single vector representing the entire review.

**Why Use These?:** 
Embeddings capture more nuanced meanings compared to TF-IDF.
Useful for complex tasks where semantic understanding matters.

**Tools:** 
Use Hugging Face's transformers library to easily load and apply pre-trained models.

In [27]:
# Load the tokenizer and the encoder from the same pre-trained BERT checkpoint
bert_checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(bert_checkpoint)
model = AutoModel.from_pretrained(bert_checkpoint)

In [28]:
# Tokenize every review in a single call: pad to a common length, truncate
# over-long inputs, and return PyTorch tensors.
all_reviews = df_rtr['combined_text'].tolist()
inputs = tokenizer(all_reviews, return_tensors='pt', truncation=True, padding=True)


In [29]:
##

In [30]:
# Replace the tokenizer and encoder with the DistilBERT checkpoint
distilbert_checkpoint = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(distilbert_checkpoint)
model = AutoModel.from_pretrained(distilbert_checkpoint)

####
Limit Input Sequence Length

By default, models like BERT support a maximum sequence length of 512 tokens. Reducing this length (e.g., to 128) significantly lowers memory usage.

####
Use a pre-trained model like BERT to extract embeddings for reviews:

In [31]:
# Tokenize all reviews with each sequence capped at 128 tokens
tokenizer_options = dict(
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors='pt',
)
inputs = tokenizer(df_rtr['combined_text'].tolist(), **tokenizer_options)

In [None]:
# Ensure combined_text exists in the DataFrame (summary + " " + review text)
df_rtr['combined_text'] = df_rtr['review_summary'] + " " + df_rtr['review_text']

# Extract the text data as a plain list for batch processing
texts = df_rtr['combined_text'].tolist()

# Encode the reviews in small batches so only a few sequences are held by the
# model at a time.
batch_size = 8  # Start with a small batch size to avoid crashes

model.eval()  # inference only: make sure dropout is disabled

# Per-batch results are collected in memory instead of writing one .npy file
# per batch to the working directory and immediately re-reading them all.
embedding_batches = []
for i in range(0, len(texts), batch_size):
    batch_texts = texts[i:i + batch_size]
    # max_length=128 matches the sequence cap used when tokenizing the full corpus
    inputs = tokenizer(batch_texts, padding=True, truncation=True,
                       max_length=128, return_tensors='pt')

    with torch.no_grad():
        outputs = model(**inputs)

    # Mean-pool over real tokens only. A plain .mean(dim=1) also averages the
    # padding positions, so a review's vector would depend on how long the
    # other reviews in its batch happen to be.
    token_states = outputs.last_hidden_state
    token_mask = inputs['attention_mask'].unsqueeze(-1).to(token_states.dtype)
    token_counts = token_mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    batch_embeddings = ((token_states * token_mask).sum(dim=1) / token_counts).numpy()
    embedding_batches.append(batch_embeddings)

# Stack the batches into one array with one row per review, in the order of `texts`
embeddings = np.concatenate(embedding_batches)



####
Use these embeddings as features for classification or clustering.

#### Clustering and Visualization:

Apply dimensionality reduction to embeddings:

In [None]:
# Project the review embeddings to two dimensions with t-SNE (fixed seed)
tsne_settings = dict(n_components=2, random_state=42)
tsne = TSNE(**tsne_settings)
reduced_embeddings = tsne.fit_transform(embeddings)

In [None]:
# TF-IDF features for the training reviews (English stop words removed,
# vocabulary capped at 5,000 terms); the vectorizer is fitted on X_train itself.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_vectorized = vectorizer.fit_transform(X_train)

# 2-D t-SNE of the TF-IDF matrix with random initialisation; rows keep the
# order of X_train, so they line up with y_train.
tsne = TSNE(init='random', random_state=42, n_components=2)
reduced_embeddings = tsne.fit_transform(X_train_vectorized)

In [None]:
# TF-IDF features for the training reviews
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
X_train_vectorized = vectorizer.fit_transform(X_train)

# 2-D t-SNE on the sparse TF-IDF matrix (random initialisation)
tsne = TSNE(init="random", random_state=42, n_components=2)
reduced_embeddings = tsne.fit_transform(X_train_vectorized)

# Scatter the projection, coloured by the encoded fit label of each training review
tsne_x = reduced_embeddings[:, 0]
tsne_y = reduced_embeddings[:, 1]
plt.scatter(tsne_x, tsne_y, s=10, alpha=0.7, c=y_train, cmap="viridis")
plt.colorbar(label="Fit Categories")
plt.title("TSNE Clusters Colored by Fit")
plt.xlabel("TSNE Component 1")
plt.ylabel("TSNE Component 2")
plt.show()

In [None]:
# Uncoloured scatter of the 2-D t-SNE projection
tsne_x = reduced_embeddings[:, 0]
tsne_y = reduced_embeddings[:, 1]
plt.scatter(tsne_x, tsne_y, alpha=0.7, s=10)
plt.xlabel("TSNE Component 1")
plt.ylabel("TSNE Component 2")
plt.title("TSNE Visualization of Embeddings")
plt.show()

In [None]:
# Scatter of the 2-D t-SNE projection, coloured by the encoded fit label in y_train
tsne_x = reduced_embeddings[:, 0]
tsne_y = reduced_embeddings[:, 1]
plt.scatter(tsne_x, tsne_y, alpha=0.7, s=10, cmap='viridis', c=y_train)
plt.colorbar(label="Fit Categories")  # colour scale for the encoded categories
plt.xlabel("TSNE Component 1")
plt.ylabel("TSNE Component 2")
plt.title("TSNE Clusters Colored by Fit")
plt.show()

In [None]:
# tsne = TSNE(n_components=2, random_state=42)
# reduced_embeddings = tsne.fit_transform(embeddings.numpy())

In [None]:
# Perform TSNE only on the training set.
# X_train holds the raw review strings, which TSNE cannot consume directly,
# so the text is first converted to TF-IDF features. Row order is preserved,
# which keeps the projected points aligned with y_train.
train_tfidf = TfidfVectorizer(max_features=5000, stop_words='english').fit_transform(X_train)

# init='random' is required because the TF-IDF matrix is sparse
tsne = TSNE(n_components=2, random_state=42, init='random')
reduced_embeddings = tsne.fit_transform(train_tfidf)

####
Visualize clusters:

In [None]:
# Scatter of the 2-D projection, coloured by the encoded fit label, with a colour scale
projected_points = plt.scatter(
    reduced_embeddings[:, 0],
    reduced_embeddings[:, 1],
    cmap='viridis',
    c=y_train,
)
plt.colorbar(projected_points)
plt.show()



####  Combine with Other Features:

Concatenate embeddings with numeric features like age or rating for richer analytics:

In [None]:
# Append the numeric review features (age, rating) as extra columns after the
# embedding dimensions.
# `embeddings` is a NumPy array (it comes from np.concatenate), so calling
# .numpy() on it raises AttributeError; np.asarray accepts an ndarray as-is.
# NOTE: 'age' and 'rating' contain missing values, so the last two columns of
# final_features hold NaN for those rows until they are imputed or dropped.
final_features = np.hstack([np.asarray(embeddings), df_rtr[['age', 'rating']].to_numpy()])

