In [95]:
import gzip
import math
import numpy as np
import random
import sklearn
import string
from collections import defaultdict
from collections import Counter
from nltk.stem.porter import *
from sklearn import linear_model
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
import dateutil
from scipy.sparse import lil_matrix # To build sparse feature matrices, if you like

In [96]:
answers = dict()  # question label (e.g. 'Q1') -> answer; populated by the cells below

In [97]:
def assertFloat(x):
    """Assert that ``x`` can be converted with ``float()``.

    The conversion happens inside the ``assert`` expression, so a value that
    ``float()`` rejects raises ``ValueError`` or ``TypeError`` from there.
    """
    assert float is type(float(x))

def assertFloatList(items, N):
    """Assert that ``items`` has exactly ``N`` entries, each convertible by ``float()``.

    A wrong length fails the first assertion; an entry that ``float()``
    rejects raises ``ValueError`` or ``TypeError`` from the conversion.
    """
    assert N == len(items)
    assert all(type(float(entry)) is float for entry in items)

In [98]:
### Question 1

In [99]:
dataset = []

# Read at most the first 20000 records. The context manager closes the archive
# even when a line fails to parse (the explicit close() was skipped on error).
with gzip.open("steam_category.json.gz") as f:
    for l in f:
        # NOTE(review): eval() executes whatever expression the line contains.
        # Only use this on a trusted data file; consider ast.literal_eval if
        # the records are plain Python literals.
        d = eval(l)
        dataset.append(d)
        if len(dataset) >= 20000:
            break

In [100]:
# Number of records in the training and test partitions
Ntrain, Ntest = 10000, 10000

# The first Ntrain records are used for training, the next Ntest for testing
dataTrain, dataTest = dataset[:Ntrain], dataset[Ntrain:Ntrain + Ntest]

In [101]:
sp = set(string.punctuation)  # characters removed by preprocess_text
word_counts = Counter()       # word -> number of occurrences (filled in below)

def preprocess_text(text):
    """Return ``text`` lower-cased with every character in ``sp`` removed."""
    lowered = text.lower()
    return ''.join(ch for ch in lowered if ch not in sp)

# Tally whitespace-separated tokens of the cleaned text over the training reviews
for review in dataTrain:
    word_counts.update(preprocess_text(review['text']).split())

# (word, count) pairs for the 1000 most frequent words, most frequent first
counts = word_counts.most_common(1000)


In [102]:
# Q1: the ten most frequent words, reported as (count, word) pairs
top_ten = counts[:10]
answers['Q1'] = [(freq, token) for token, freq in top_ten]
assertFloatList([entry[0] for entry in answers['Q1']], 10)

In [103]:
### Question 2

In [104]:
NW = 1000 # dictionary size: how many of the most frequent words are kept as features

In [105]:
words = [pair[0] for pair in counts[:NW]]  # vocabulary: the NW most frequent words from Q1

In [106]:
# Build X...
# The vocabulary was produced by preprocess_text(...).split(): punctuation is
# deleted and tokens are split on whitespace. CountVectorizer's default
# analyzer instead splits on punctuation and ignores one-character tokens, so
# vocabulary entries such as single-letter words or apostrophe-stripped
# contractions would never be counted. Tokenise exactly like the vocabulary.
vectorizer = CountVectorizer(
    vocabulary=words,  # the top-NW words, in frequency order
    analyzer=lambda text: preprocess_text(text).split(),
)
# The vocabulary is fixed, so fitting learns nothing from the test rows;
# preprocess_text already lower-cases the text.
X = vectorizer.fit_transform([review['text'] for review in dataset])

In [107]:
# Labels: the genreID of every record, in dataset order
y = [entry['genreID'] for entry in dataset]

In [108]:
# Rows before index Ntrain are used for fitting; all remaining rows for evaluation
Xtrain, ytrain = X[:Ntrain], y[:Ntrain]
Xtest, ytest = X[Ntrain:], y[Ntrain:]

In [109]:
# With the default iteration limit the solver stopped before converging
# ("TOTAL NO. of ITERATIONS REACHED LIMIT"), so the reported accuracy came
# from an unconverged model. Allow more iterations; raise the limit further
# if the warning still appears.
mod = linear_model.LogisticRegression(C=1, max_iter=1000)
mod.fit(Xtrain, ytrain)
predictions = mod.predict(Xtest)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [110]:
# One boolean per test example: does the predicted label equal the true label?
correct = [predicted == actual for predicted, actual in zip(predictions, ytest)]

In [111]:
# Q2: test accuracy. The comparisons against the numpy prediction array make
# the ratio a numpy scalar; convert to a built-in float so that str(answers)
# writes a bare number rather than a numpy repr such as "np.float64(...)".
answers['Q2'] = float(sum(correct) / len(correct))
assertFloat(answers['Q2'])

In [112]:
### Question 3

In [113]:
target_words = ["character", "game", "length", "a", "it"]

# Document frequency: number of training reviews that contain each word
df = defaultdict(int)
for review in dataTrain:
    # set() so a word is counted once per review. A dedicated name is used so
    # the global vocabulary list `words` is not overwritten by a per-review set.
    doc_words = set(preprocess_text(review['text']).split())
    for w in doc_words:
        df[w] += 1

N = len(dataTrain)  # number of training documents

In [114]:
# Calculate IDF for target words
# Inverse document frequency (base-10 log) with the document frequency offset by one.
# NOTE(review): compute_idf() further down uses N / df without the +1 — confirm
# which definition is intended.
idf = {word: math.log10(N / (1 + df[word])) for word in target_words}

# Tokens of the first training review
first_review = preprocess_text(dataTrain[0]['text'])
first_review_words = first_review.split()

# Term frequency: raw number of occurrences in the first review
tf = {word: first_review_words.count(word) for word in target_words}

# Compute TF-IDF for the target words
tfidf = {word: tf[word] * idf[word] for word in target_words}

In [115]:
# Q3: one (idf, tf-idf) pair per target word, in target_words order
answers['Q3'] = list(zip((idf[w] for w in target_words),
                         (tfidf[w] for w in target_words)))

assertFloatList([pair[0] for pair in answers['Q3']], 5)
assertFloatList([pair[1] for pair in answers['Q3']], 5)

In [116]:
### Question 4

In [117]:
# Build X and y...

In [118]:
# Previously this cell re-sliced X, which was still the Q2 count matrix, so
# Question 4 reproduced the Q2 model exactly.
# NOTE(review): assumes Question 4 asks for TF-IDF features over the same
# NW-word dictionary (the "Build X and y" cell was left empty) — confirm
# against the assignment text.
tfidf_vocabulary = [word for word, _ in counts[:NW]]
tfidf_vectorizer = TfidfVectorizer(
    vocabulary=tfidf_vocabulary,
    # Tokenise the same way the vocabulary was built
    analyzer=lambda text: preprocess_text(text).split(),
)

# Learn the idf weights from the training reviews only, then apply them to
# the held-out reviews.
Xtrain = tfidf_vectorizer.fit_transform([review['text'] for review in dataTrain])
ytrain = [review['genreID'] for review in dataTrain]
Xtest = tfidf_vectorizer.transform([review['text'] for review in dataTest])
ytest = [review['genreID'] for review in dataTest]

In [119]:
# The default iteration limit was reached before convergence (see the
# "TOTAL NO. of ITERATIONS REACHED LIMIT" warning); allow more iterations and
# raise the limit further if the warning still appears.
mod = linear_model.LogisticRegression(C=1, max_iter=1000)
mod.fit(Xtrain, ytrain)

predictions = mod.predict(Xtest)
# One boolean per test example: predicted label equals true label
correct = [predicted == actual for predicted, actual in zip(predictions, ytest)]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [120]:
# Q4: test accuracy, stored as a built-in float so that str(answers) writes a
# bare number rather than a numpy scalar repr.
answers['Q4'] = float(sum(correct) / len(correct))
assertFloat(answers['Q4'])

In [121]:
### Question 5

In [122]:
def Cosine(x1, x2):
    """Cosine similarity between two sparse vectors stored as dicts.

    Parameters
    ----------
    x1, x2 : dict
        Map a feature (e.g. a word) to a numeric weight. A key missing from
        one vector is treated as weight 0.

    Returns
    -------
    float
        dot(x1, x2) / (|x1| * |x2|), or 0.0 when either vector has zero norm
        (which includes an empty dict).
    """
    norm_x1 = math.sqrt(sum(value**2 for value in x1.values()))
    norm_x2 = math.sqrt(sum(value**2 for value in x2.values()))

    # edge cases: a zero vector has no direction
    if norm_x1 == 0 or norm_x2 == 0:
        return 0.0

    # Only keys present in both vectors contribute to the dot product, so walk
    # the smaller dict instead of the union of keys. Dict iteration follows
    # insertion order, so the summation order no longer depends on string hashing.
    small, large = (x1, x2) if len(x1) <= len(x2) else (x2, x1)
    dot_product = sum(weight * large[key] for key, weight in small.items() if key in large)

    return dot_product / (norm_x1 * norm_x2)

In [123]:
# Compute IDF
def compute_idf(dataTrain):
    """Return a defaultdict(float) of inverse document frequencies.

    For every word seen in the cleaned 'text' of the given reviews the value
    is log10(number of reviews / number of reviews containing the word).
    """
    n_docs = len(dataTrain)

    # Number of reviews in which each word appears at least once
    docs_with_word = Counter()
    for review in dataTrain:
        docs_with_word.update(set(preprocess_text(review['text']).split()))

    weights = defaultdict(float)
    for term, n_containing in docs_with_word.items():
        weights[term] = math.log10(n_docs / n_containing) if n_containing > 0 else 0.0

    return weights

idf = compute_idf(dataTrain)

def compute_tfidf(text):
    """Return {word: tf-idf weight} for the words of ``text``.

    Term frequency is the word's count divided by the number of tokens in the
    cleaned text; the idf weight is looked up in the module-level ``idf``
    mapping, and a word missing from it gets weight 0.0.
    """
    tokens = preprocess_text(text).split()
    n_tokens = len(tokens)
    term_counts = Counter(tokens)
    return {term: (n / n_tokens) * idf.get(term, 0.0) for term, n in term_counts.items()}

# Check TF-IDF and similarity computation
# TF-IDF vector of the first training review (the query) ...
first_review_tfidf = compute_tfidf(dataTrain[0]['text'])
# ... and (vector, reviewID) for each test review; reviewID is None when the key is absent
test_tfidfs = [(compute_tfidf(entry['text']), entry.get('reviewID')) for entry in dataTest]

In [124]:
# Cosine similarity between the query review and every test review, as
# (similarity, reviewID) tuples ordered from largest to smallest.
similarities = sorted(
    ((Cosine(first_review_tfidf, vec), rid) for vec, rid in test_tfidfs),
    reverse=True,
)

In [125]:
# Q5: the (similarity, reviewID) tuple that sorts first
best_match = similarities[0]
answers['Q5'] = best_match
assertFloat(best_match[0])

In [126]:
### Question 6

In [127]:
# Define possible values for NW and C
dictionary_sizes = [500, 1000, 2000]            # candidate values for NW
regularization_constants = [0.01, 0.1, 1, 10]   # candidate values for C

# Best configuration seen so far (updated by the search loop)
best_accuracy, best_NW, best_C = 0, None, None

In [128]:
# Iterate over different dict size
# Labels and raw texts do not depend on the hyper-parameters: build them once.
y = [review['genreID'] for review in dataset]
ytrain = y[:Ntrain]
ytest = y[Ntrain:]
train_texts = [review['text'] for review in dataset[:Ntrain]]
test_texts = [review['text'] for review in dataset[Ntrain:]]

# NOTE(review): the best configuration is chosen by test-set accuracy, so the
# reported value is optimistic; a separate validation split would avoid that.
for NW in dictionary_sizes:
    # `counts` only holds the 1000 most common words, so slicing it could never
    # produce the 2000-word dictionary; query the full counter instead.
    words = [word for word, _ in word_counts.most_common(NW)]
    vectorizer = TfidfVectorizer(
        vocabulary=words,
        # Tokenise the same way the vocabulary was built
        analyzer=lambda text: preprocess_text(text).split(),
    )

    # Learn idf weights from the training reviews only, then apply to the test reviews
    Xtrain = vectorizer.fit_transform(train_texts)
    Xtest = vectorizer.transform(test_texts)

    # Iterate over different C values
    for C in regularization_constants:
        # The default iteration limit was reached before convergence (see the
        # repeated warnings); allow more iterations.
        mod = linear_model.LogisticRegression(C=C, max_iter=1000)
        mod.fit(Xtrain, ytrain)

        predictions = mod.predict(Xtest)
        accuracy = accuracy_score(ytest, predictions)

        # Update best parameters
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_NW = NW
            best_C = C

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [129]:
# Q6: best accuracy found by the search over NW and C
answers.update(Q6=best_accuracy)
assertFloat(best_accuracy)

In [130]:
### Question 7

In [131]:
import dateutil.parser

In [132]:
dataset = []

# Read at most the first 20000 records. The context manager closes the archive
# even when a record fails to parse (the explicit close() was skipped on error).
with gzip.open("young_adult_20000.json.gz") as f:
    for l in f:
        # NOTE(review): eval() executes whatever expression the line contains.
        # Only use this on a trusted data file; consider ast.literal_eval if
        # the records are plain Python literals.
        d = eval(l)
        # Parsed timestamp, used to order each user's reviews chronologically
        d['datetime'] = dateutil.parser.parse(d['date_added'])
        dataset.append(d)
        if len(dataset) >= 20000:
            break

In [133]:
# Group (datetime, book_id) pairs by user
reviews_by_user = defaultdict(list)
for entry in dataset:
    reviews_by_user[entry['user_id']].append((entry['datetime'], entry['book_id']))

# One list of book IDs per user, ordered by the time the review was added
reviewLists = [
    [book_id for _, book_id in sorted(history, key=lambda pair: pair[0])]
    for history in reviews_by_user.values()
]

In [134]:
# Item2vec-style model: each user's chronological list of book IDs is one "sentence"
w2v_params = dict(
    min_count=1,    # Words/items with fewer instances are discarded
    vector_size=5,  # Model dimensionality
    window=3,       # Window size
    sg=1,           # Skip-gram model
)
model5 = Word2Vec(reviewLists, **w2v_params)

In [135]:
# Query: the first book in the first user's list
first_user_history = reviewLists[0]
first_book = first_user_history[0]
# The five entries most similar to it according to the learned vectors
res = model5.wv.most_similar(first_book, topn=5)

In [136]:
# Q7: the (at most) five most similar results; index 1 of each entry is checked as a float
top_matches = res[:5]
answers['Q7'] = top_matches
assertFloatList([match[1] for match in top_matches], 5)

In [137]:
# Write the answers dict as a single line of text. The context manager
# flushes and closes the file even if the write raises.
with open("answers_hw4.txt", 'w') as f:
    f.write(str(answers) + '\n')