### Data Augmentation

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Notebook display setting: with ast_node_interactivity = "all", IPython echoes
# the value of every top-level expression in a cell instead of only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Mount Google Drive inside the Colab runtime so files under
# /content/drive/MyDrive can be opened by path (Colab-only dependency).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the training ratings from the mounted Drive; the augmentation cells below
# read the 'userId' and 'movieId' columns of this frame.
train_ratings = pd.read_csv('/content/drive/MyDrive/train_dataset.csv')

In [None]:
from concurrent.futures import ThreadPoolExecutor

def augment(users, data, movies, n_negatives=20, seed=None):
    """Sample negative (user, movie) pairs for a block of users.

    For every user in ``users``, draws up to ``n_negatives`` distinct movies
    from ``movies`` for which the user has no row in ``data`` and labels each
    sampled pair with ``target = 0``.

    Parameters
    ----------
    users : iterable
        User ids to generate negatives for.
    data : pandas.DataFrame
        Observed interactions; must contain ``userId`` and ``movieId`` columns.
    movies : iterable
        Candidate movie ids to sample from (ids must be sortable).
    n_negatives : int, default 20
        Negatives requested per user. A user with fewer unseen candidates
        receives all of them instead of triggering an error.
    seed : int, optional
        Seed for the sampling. ``None`` gives a fresh random draw per call.

    Returns
    -------
    pandas.DataFrame
        Sampled rows in random order with a fresh 0..n-1 index. Columns are
        those of ``data`` (``userId``/``movieId``/``target`` are added if
        ``data`` lacks them); any other column of ``data`` is left as NaN.
    """
    users = list(users)
    # A private generator keeps concurrent calls independent of each other and
    # makes a call reproducible when a seed is supplied.
    rng = np.random.default_rng(seed)
    # Sorted so that a seeded call does not depend on set iteration order.
    movie_pool = np.array(sorted(set(movies)))

    # One pass over this block's rows instead of rescanning `data` per user.
    block = data[data['userId'].isin(users)]
    seen_by_user = {
        uid: set(group) for uid, group in block.groupby('userId')['movieId']
    }

    picked_users, picked_movies = [], []
    for user_id in users:
        seen = seen_by_user.get(user_id)
        if seen:
            candidates = movie_pool[~np.isin(movie_pool, list(seen))]
        else:
            candidates = movie_pool
        # Never ask for more movies than are available for this user.
        n_draws = min(n_negatives, len(candidates))
        if n_draws <= 0:
            continue
        draws = rng.choice(candidates, size=n_draws, replace=False)
        picked_users.extend([user_id] * n_draws)
        picked_movies.extend(draws.tolist())

    # Build the frame once; DataFrame.append no longer exists in pandas >= 2.0
    # and growing a frame row by row is quadratic.
    subset = pd.DataFrame({
        'userId': picked_users,
        'movieId': picked_movies,
        'target': [0] * len(picked_movies),
    })
    columns = list(data.columns) + [c for c in subset.columns if c not in data.columns]
    subset = subset.reindex(columns=columns)

    # Shuffle so rows are not grouped by user.
    shuffled = subset.iloc[rng.permutation(len(subset))]
    return shuffled.reset_index(drop=True)

In [None]:
from concurrent.futures import ThreadPoolExecutor

# Positive examples: every observed (user, movie) rating gets target = 1.
# .assign() returns a new frame, which avoids the SettingWithCopyWarning raised
# by writing a column onto a column-subset of train_ratings.
data = train_ratings[['userId', 'movieId']].assign(target=1)

unique_users = data['userId'].unique()
users = len(unique_users)
unique_movies = set(data['movieId'].unique())

BLOCK_SIZE = 100  # users handled by one augment() call
# A small pool is enough: one thread per block (the old 10000 cap) only adds
# thread overhead. NOTE(review): the work is mostly pandas/Python code, so
# threads are unlikely to give much parallel speed-up here - confirm by timing.
MAX_WORKERS = 8

futures = []
negative_blocks = []
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    # Stepping by BLOCK_SIZE never produces an empty trailing block, unlike
    # range(users // 100 + 1) when the user count is a multiple of 100.
    for block_start in range(0, users, BLOCK_SIZE):
        user_block = unique_users[block_start: block_start + BLOCK_SIZE]
        futures.append(executor.submit(augment, user_block, data, unique_movies))
    print(f'Submitted {len(futures)} blocks')

    # Collect inside the pool context so the progress lines track real progress.
    for i, future in enumerate(futures):
        negative_blocks.append(future.result())
        print(f'Created negative data for block {i}. {len(futures) - i - 1} left')

# Concatenate once; concatenating inside the loop re-copies the growing frame
# on every iteration.
data = pd.concat([data, *negative_blocks], ignore_index=True)

data.to_csv('augmented_dataset.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['target'] = 1


Submitting  0
Submitting  1
Submitting  2
Submitting  3
Submitting  4
Submitting  5
Submitting  6
Submitting  7
Submitting  8
Submitting  9
Submitting  10
Submitting  11
Submitting  12
Submitting  13
Submitting  14
Submitting  15
Submitting  16
Submitting  17
Submitting  18
Submitting  19
Submitting  20
Submitting  21
Submitting  22
Submitting  23
Submitting  24
Submitting  25
Submitting  26
Submitting  27
Submitting  28
Submitting  29
Submitting  30
Submitting  31
Submitting  32
Submitting  33
Submitting  34
Submitting  35
Submitting  36
Submitting  37
Submitting  38
Submitting  39
Submitting  40
Submitting  41
Submitting  42
Submitting  43
Submitting  44
Submitting  45
Submitting  46
Submitting  47
Submitting  48
Submitting  49
Submitting  50
Submitting  51
Submitting  52
Submitting  53
Submitting  54
Submitting  55
Submitting  56
Submitting  57
Submitting  58
Submitting  59
Submitting  60
Submitting  61
Submitting  62
Submitting  63
Submitting  64
Submitting  65
Submitting  66
Submi