In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import plotly.express as px
from transformers import pipeline
from tqdm import tqdm

In [4]:
np.random.seed(42)

### Exploring the data
Data source: https://www.kaggle.com/datasets/najzeko/steam-reviews-2021/data

In [5]:
# Get the total number of rows in the CSV file
total_rows = sum(1 for row in open('data/steam_reviews.csv')) - 1  # subtract 1 for header
sample_size = int(total_rows * 0.1)

# Read only 10% of the CSV file
steam_reviews = pd.read_csv('data/steam_reviews.csv', skiprows=lambda x: x > 0 and np.random.rand() > 0.1, nrows=sample_size)

In [6]:
# Get all unique app names present in the dataset
app_names = steam_reviews['app_name'].unique()
app_names = app_names.tolist()
app_names

['The Witcher 3: Wild Hunt',
 'Half-Life',
 'Counter-Strike: Source',
 'Half-Life 2: Episode Two',
 'Portal 2',
 'X Rebirth',
 "Garry's Mod",
 "Sid Meier's Civilization V",
 'Dead by Daylight',
 "Sid Meier's Civilization VI",
 'Subnautica',
 'Human: Fall Flat',
 'Beat Saber',
 'Cold Waters',
 'Banished',
 'Celeste',
 'Getting Over It with Bennett Foddy',
 'A Hat in Time',
 'Overcooked! 2',
 'Slipstream',
 'The Forest',
 'Pogostuck: Rage With Your Friends',
 'PC Building Simulator',
 'RollerCoaster Tycoon World',
 'NBA 2K18',
 'NBA 2K21',
 'Deus Ex: The Fall',
 'Rapture Rejects',
 'Artifact',
 'Call of Duty: Infinite Warfare',
 'Cube World',
 'NBA 2K19',
 'Nether',
 'Wolfenstein: Youngblood',
 'Warhammer 40,000: Dawn of War III',
 'Takedown: Red Sabre',
 'ATLAS',
 'Stay Out',
 'Identity',
 'Umbrella Corps',
 'Hunt Down The Freeman',
 'WWE 2K20',
 'Down To One',
 'Axiom Verge',
 'Guacamelee! Super Turbo Championship Edition',
 'The Binding of Isaac: Rebirth',
 'To the Moon',
 'Cave Story

In [7]:
# Number of reviews in English
steam_reviews[(steam_reviews["language"] == "english")].shape

(962962, 23)

In [8]:
#Possible review Bombing for GTA V between 2017-06-01 and 2017-07-31
one_game_only_english = steam_reviews[(steam_reviews["app_name"].str.contains("Grand Theft Auto", case = False)) 
                                      & (steam_reviews["language"] == "english")
                                      & (steam_reviews["recommended"] == False)
                                      & (steam_reviews["timestamp_updated"] > 1496268000)
                                      & (steam_reviews["timestamp_updated"] < 1501538399)
                                      & ((steam_reviews["review"].str.contains("Take-Two", case = False)) 
                                         | (steam_reviews["review"].str.contains("OpenIV", case = False)))
                                      & (steam_reviews["votes_funny"] == 0)]
one_game_only_english.head()

Unnamed: 0.1,Unnamed: 0,app_id,app_name,review_id,language,review,timestamp_created,timestamp_updated,recommended,votes_helpful,...,steam_purchase,received_for_free,written_during_early_access,author.steamid,author.num_games_owned,author.num_reviews,author.playtime_forever,author.playtime_last_two_weeks,author.playtime_at_review,author.last_played
1349925,13496915,271590,Grand Theft Auto V,33812000,english,OpenIV fiasco shows how a company tries to dic...,1501085473,1501085531,False,0,...,True,False,False,76561198024673004,15,2,10175.0,0.0,4410.0,1509200000.0
1350441,13501704,271590,Grand Theft Auto V,33345032,english,Take Two basically killed their own reputation...,1499439418,1499439567,False,2,...,False,False,False,76561198087267083,20,2,9593.0,0.0,1280.0,1565367000.0
1350443,13501715,271590,Grand Theft Auto V,33343849,english,What's done is done. Removing OpenIV greatly d...,1499436776,1499436776,False,2,...,True,False,False,76561198239439730,24,1,16134.0,0.0,7453.0,1592070000.0
1350493,13502206,271590,Grand Theft Auto V,33307413,english,You focus on banning single player modding not...,1499337665,1499337665,False,5,...,True,False,False,76561198045782126,121,20,32323.0,0.0,18078.0,1589024000.0
1350624,13503350,271590,Grand Theft Auto V,33238809,english,Here's my philosophy on the games as it relate...,1499187311,1499187311,False,1,...,True,False,False,76561198070812011,439,26,863.0,0.0,,1591927000.0


In [9]:
total_gta_reviews = steam_reviews[(steam_reviews["app_name"].str.contains("Grand Theft Auto", case = False))
                                  & (steam_reviews["language"] == "english")
                                  & (steam_reviews["timestamp_updated"] > 1496268000)
                                  & (steam_reviews["timestamp_updated"] < 1501538399)]
total_gta_reviews.head()

Unnamed: 0.1,Unnamed: 0,app_id,app_name,review_id,language,review,timestamp_created,timestamp_updated,recommended,votes_helpful,...,steam_purchase,received_for_free,written_during_early_access,author.steamid,author.num_games_owned,author.num_reviews,author.playtime_forever,author.playtime_last_two_weeks,author.playtime_at_review,author.last_played
1349836,13496038,271590,Grand Theft Auto V,33917530,english,This Positive Review was Ruined by the Error\n...,1501462530,1501462530,False,1,...,True,False,False,76561198033281896,143,1,25309.0,0.0,16139.0,1598193000.0
1349838,13496053,271590,Grand Theft Auto V,33915584,english,This game is fun and ha a lot of aspects to it...,1501453984,1501453984,True,0,...,True,True,False,76561198332312545,18,1,2823.0,65.0,2659.0,1610338000.0
1349846,13496157,271590,Grand Theft Auto V,33901816,english,░░░░░░░░░░░░░░░░░░░░░░█████████\n░░███████░░░░...,1501411178,1501411178,True,1,...,True,False,False,76561198035380665,129,74,17746.0,0.0,15743.0,1579447000.0
1349848,13496167,271590,Grand Theft Auto V,33900688,english,Always a have a fun time playing online and th...,1501407064,1501407064,True,1,...,True,False,False,76561197984771212,356,2,8695.0,0.0,837.0,1526875000.0
1349849,13496172,271590,Grand Theft Auto V,33899864,english,lets m e m e the community by banning people f...,1501403596,1501403596,False,2,...,True,False,False,76561198085821142,182,6,10094.0,0.0,6345.0,1580589000.0


In [10]:
# Compare the number of rows of both tables
rows_one_game_only_english = one_game_only_english.shape[0]
rows_total_gta_reviews = total_gta_reviews.shape[0]

print(f"Number of rows in one_game_only_english: {rows_one_game_only_english}")
print(f"Number of rows in total_gta_reviews: {rows_total_gta_reviews}")
print(f"Percentage of review bombing: {rows_one_game_only_english / rows_total_gta_reviews * 100:.2f}%")

Number of rows in one_game_only_english: 591
Number of rows in total_gta_reviews: 4111
Percentage of review bombing: 14.38%


### Setting the basis for the graph to build

In [11]:
steam_reviews_english = steam_reviews[(steam_reviews["language"] == "english")]
steam_reviews_english = steam_reviews_english[['app_id', 'app_name', 'review', 'review_id','timestamp_created', 'timestamp_updated', 'recommended', 'author.steamid', 'weighted_vote_score']]
steam_reviews_english = steam_reviews_english.sort_values(by=['timestamp_created', 'timestamp_updated'], ascending=[True, True])

# Replace the author.steamid with smaller numbers starting from 0
steam_reviews_english['author.steamid'] = pd.factorize(steam_reviews_english['author.steamid'])[0]

# Replace the app_id with smaller numbers starting from 0
steam_reviews_english['app_id'] = pd.factorize(steam_reviews_english['app_id'])[0]

# Replace the review_id with smaller numbers starting from 0

steam_reviews_english['review_id'] = pd.factorize(steam_reviews_english['review_id'])[0]

steam_reviews_english.head()

Unnamed: 0,app_id,app_name,review,review_id,timestamp_created,timestamp_updated,recommended,author.steamid,weighted_vote_score
156030,0,Garry's Mod,For creative & awesome people,0,1290197836,1290197836,True,0,0.0
156029,0,Garry's Mod,If you don't own Garry's Mod. You aren't truel...,1,1290198787,1386116339,True,1,0.472947
156028,0,Garry's Mod,TOP NOTCH GAME RIGHT HERE.\nSo many ways to pl...,2,1290200117,1385462680,True,2,0.0
156027,0,Garry's Mod,You All Should have this game by now,3,1290202964,1290202964,True,3,0.0
156026,0,Garry's Mod,Is a fantastic sandbox which you can give 👍 to...,4,1290208819,1561670109,True,4,0.584615


In [12]:
# Reviews that were updated
steam_reviews_updated = steam_reviews_english[(steam_reviews_english['timestamp_created'] != steam_reviews_english['timestamp_updated'])]

# Reviews that were not updated
steam_reviews_not_updated = steam_reviews_english[(steam_reviews_english['timestamp_created'] == steam_reviews_english['timestamp_updated'])]

In [13]:
steam_reviews_updated.head()

Unnamed: 0,app_id,app_name,review,review_id,timestamp_created,timestamp_updated,recommended,author.steamid,weighted_vote_score
156029,0,Garry's Mod,If you don't own Garry's Mod. You aren't truel...,1,1290198787,1386116339,True,1,0.472947
156028,0,Garry's Mod,TOP NOTCH GAME RIGHT HERE.\nSo many ways to pl...,2,1290200117,1385462680,True,2,0.0
156026,0,Garry's Mod,Is a fantastic sandbox which you can give 👍 to...,4,1290208819,1561670109,True,4,0.584615
156023,0,Garry's Mod,Taking the aspects of the Source Engine and ex...,7,1290219743,1543063053,True,7,0.0
156020,0,Garry's Mod,funnest game ever you do anything you want,10,1290242820,1385416637,True,10,0.0


In [14]:
steam_reviews_not_updated.head()

Unnamed: 0,app_id,app_name,review,review_id,timestamp_created,timestamp_updated,recommended,author.steamid,weighted_vote_score
156030,0,Garry's Mod,For creative & awesome people,0,1290197836,1290197836,True,0,0.0
156027,0,Garry's Mod,You All Should have this game by now,3,1290202964,1290202964,True,3,0.0
156025,0,Garry's Mod,Garrys Mod gives you endless possibilites from...,5,1290211699,1290211699,True,5,0.0
156024,0,Garry's Mod,Best game ever,6,1290216760,1290216760,True,6,0.0
156022,0,Garry's Mod,That terrorist is a spy,8,1290224426,1290224426,True,8,0.0


In [15]:
def create_review_df(df):
    data = []
    for _, row in df.iterrows():
        app_id = row['app_id']
        app_name = row['app_name']
        review_id = row['review_id']
        review = row['review']
        timestamp = row['timestamp_created']
        timestamp_updated = row['timestamp_updated']
        recommended = row['recommended']
        author_id = row['author.steamid']
        weighted_vote_score = row['weighted_vote_score']
        data.append([app_id, app_name, review, review_id,timestamp, recommended, author_id, weighted_vote_score])
        # Add the updated review if it exists
        if timestamp != timestamp_updated:
            data.append([app_id, app_name, review, review_id,timestamp_updated, recommended, author_id, weighted_vote_score])
    
    new_df = pd.DataFrame(data, columns=['app_id', 'app_name', 'review', 'review_id', 
                                         'timestamp', 'recommended', 'author_id', 
                                         'weighted_vote_score'])
    new_df = new_df.sort_values(by=['timestamp'], ascending=[True])
    return new_df

In [16]:
def merge_and_order_reviews(df1, df2, parameter = 'timestamp'):
    return pd.concat([df1, df2]).sort_values(by=[parameter], ascending=[True])

In [17]:
steam_reviews_updated = create_review_df(steam_reviews_updated)
steam_reviews_not_updated = create_review_df(steam_reviews_not_updated)
steam_reviews_all = merge_and_order_reviews(steam_reviews_updated, steam_reviews_not_updated)

In [18]:
steam_reviews_all.head()

Unnamed: 0,app_id,app_name,review,review_id,timestamp,recommended,author_id,weighted_vote_score
0,0,Garry's Mod,For creative & awesome people,0,1290197836,True,0,0.0
0,0,Garry's Mod,If you don't own Garry's Mod. You aren't truel...,1,1290198787,True,1,0.472947
2,0,Garry's Mod,TOP NOTCH GAME RIGHT HERE.\nSo many ways to pl...,2,1290200117,True,2,0.0
1,0,Garry's Mod,You All Should have this game by now,3,1290202964,True,3,0.0
4,0,Garry's Mod,Is a fantastic sandbox which you can give 👍 to...,4,1290208819,True,4,0.584615


In [19]:
steam_reviews_all['review'].fillna('', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  steam_reviews_all['review'].fillna('', inplace=True)


### Parse the reviews as LIWC feature vector usign VADER

In [20]:
import nltk

nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/gonzalo/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [21]:
# Bad review of GTA V
review = steam_reviews_all[(steam_reviews_all['author_id'] == 67820)
                           & (steam_reviews_all['timestamp'] == 1428975119)]['review'].values[0]
review

'I love this game....\nBut i HAVE to do it.\nTakeTwo have to know they f*cked up.\nKilling the best selling point for the game right now is just dumb.\nGTA V storyline is great but we haven\'t recieved our story DLC yet... now we only have GTAO .....AAAAND they\'ve just killed it.\nTakeTwo, please, fire your lawyers, they took your money for their "work" and for future game sales.\n\n#BringBackOpenIV'

In [22]:
# Test the sentiment analyzer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Create an instance of the Vader sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# List of example texts to analyze
texts = [
    review 
]

# Loop through the texts and get the sentiment scores for each one
for text in texts:
    scores = analyzer.polarity_scores(text)
    print(text)
    print(type(scores), scores)

I love this game....
But i HAVE to do it.
TakeTwo have to know they f*cked up.
Killing the best selling point for the game right now is just dumb.
GTA V storyline is great but we haven't recieved our story DLC yet... now we only have GTAO .....AAAAND they've just killed it.
TakeTwo, please, fire your lawyers, they took your money for their "work" and for future game sales.

#BringBackOpenIV
<class 'dict'> {'neg': 0.205, 'neu': 0.619, 'pos': 0.175, 'compound': -0.5994}


In [23]:
# Function to get the sentiment score all reviews in the dataset
def get_sentiment_scores(df, text_column_name='review'):
    # Create an instance of the Vader sentiment analyzer
    analyzer = SentimentIntensityAnalyzer()
    
    # List to store the sentiment scores
    sentiment_scores = []
    
    # Loop through all the reviews
    for _, row in df.iterrows():
        # Get the review text
        review = row[text_column_name]

        if type(review) != str:
            print("Review is Not a String")
            print(row)
        # Get the sentiment scores
        scores = analyzer.polarity_scores(review)

        # Transform the scores into a array
        scores = [scores['neg'], scores['neu'], scores['pos'], scores['compound']]

        # Append the scores to the list
        sentiment_scores.append(scores)
    
    return sentiment_scores

In [24]:
sentiment_scores_steam_reviews = get_sentiment_scores(steam_reviews_all)
sentiment_scores_steam_reviews[:5]

[[0.0, 0.222, 0.778, 0.7906],
 [0.0, 1.0, 0.0, 0.0],
 [0.0, 0.701, 0.299, 0.8962],
 [0.0, 1.0, 0.0, 0.0],
 [0.033, 0.843, 0.124, 0.9015]]

In [25]:
# Add the sentiment scores to the DataFrame
sentiment_columns = ['neg', 'neu', 'pos', 'compound']
steam_reviews_all_vader = steam_reviews_all.copy()
steam_reviews_all_vader[sentiment_columns] = pd.DataFrame(sentiment_scores_steam_reviews, index=steam_reviews_all.index)
steam_reviews_all_vader.head()

Unnamed: 0,app_id,app_name,review,review_id,timestamp,recommended,author_id,weighted_vote_score,neg,neu,pos,compound
0,0,Garry's Mod,For creative & awesome people,0,1290197836,True,0,0.0,0.0,0.222,0.778,0.7906
0,0,Garry's Mod,If you don't own Garry's Mod. You aren't truel...,1,1290198787,True,1,0.472947,0.0,1.0,0.0,0.0
2,0,Garry's Mod,TOP NOTCH GAME RIGHT HERE.\nSo many ways to pl...,2,1290200117,True,2,0.0,0.0,0.701,0.299,0.8962
1,0,Garry's Mod,You All Should have this game by now,3,1290202964,True,3,0.0,0.0,1.0,0.0,0.0
4,0,Garry's Mod,Is a fantastic sandbox which you can give 👍 to...,4,1290208819,True,4,0.584615,0.033,0.843,0.124,0.9015


### Attempt to vectorize using Roberta Pre-trained Model
Source: https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest

In [18]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from scipy.special import softmax

In [19]:
def preprocess_text(text):
    new_text = []
    for t in text.split(" "):
        t = "@user" if t.startswith("@") and len(t) > 1 else t
        t = "http" if t.startswith("http") else t
        new_text.append(t)
    return " ".join(new_text)

In [22]:
# Testing the model
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)

model = AutoModelForSequenceClassification.from_pretrained(MODEL)

text = review

text = preprocess_text(text)
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
scores = output[0][0].detach().numpy()
scores = softmax(scores)

ranking = np.argsort(scores)
ranking = ranking[::-1]
for i in range(scores.shape[0]):
    l = model.config.id2label[ranking[i]]
    s = scores[ranking[i]]
    print(f"{i + 1}) {l} {np.round(float(s), 4)}")

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


1) negative 0.8293
2) neutral 0.1147
3) positive 0.056


In [23]:
def get_sentiment_scores_roberta(df, text_column_name='review'):
    sentiment_scores = []
    for _, row in tqdm(df.iterrows(), total=df.shape[0], desc="Processing reviews"):
        review = row[text_column_name]
        review = preprocess_text(review)
        encoded_input = tokenizer(review, return_tensors='pt', padding=True, truncation=True, max_length=512)
        output = model(**encoded_input)
        scores = output[0][0].detach().numpy()
        scores = softmax(scores)
        sentiment_scores.append(scores)
    return sentiment_scores

In [None]:
# Ensure the tokenizer and model are using the same configuration and vocabulary
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)

model = AutoModelForSequenceClassification.from_pretrained(MODEL)

sentiment_scores_steam_reviews_roberta = get_sentiment_scores_roberta(steam_reviews_all)
sentiment_scores_steam_reviews_roberta[:5]

In [None]:
# Add the sentiment scores to the DataFrame
sentiment_columns = ['neg', 'neu', 'pos']
steam_reviews_all_roberta = steam_reviews_all.copy()
steam_reviews_all_roberta[sentiment_columns] = pd.DataFrame(sentiment_scores_steam_reviews_roberta, index=steam_reviews_all.index)
steam_reviews_all_roberta.head()

### The network should be in the following format:

- One line per interaction/edge.
- Each line should be: user, item, timestamp, state label, comma-separated array of features.
- First line is the network format.
- User and item fields can be alphanumeric.
- Timestamp should be in cardinal format (not in datetime).
- State label should be 1 whenever the user state changes, 0 otherwise. If there are no state labels, use 0 for all interactions.
- Feature list can be as long as desired. It should be atleast 1 dimensional. If there are no features, use 0 for all interactions.


In [11]:
def transform_to_network(df):
    network_data = []
    for _, row in df.iterrows():
        user = row['author.steamid']
        item = row['app_id']
        timestamp = row['timestamp']
        state_label = 0
        negative = row['neg']
        neutral = row['neu']
        positive = row['pos']
        # Add the features list to the network data
        array_to_append = [user, item, timestamp, state_label, negative, neutral, positive]
        network_data.append(array_to_append)
    # Create a DataFrame from the network data
    network_df = pd.DataFrame(network_data, columns=['user_id', 'item_id', 'timestamp', 'state_label', 'negative', 'neutral', 'positive'])
    return network_df

In [12]:
# Transform the steam_reviews in english DataFrame
network_df = transform_to_network(steam_reviews_english)

# Display the first few rows of the transformed network DataFrame
network_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0,0,1290197836,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0,1290198787,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.472947
2,2,0,1290200117,0,1,0.350455,0.285439,0.0,0.0,0.361794,0.0,0.337014,0.0,0.74245,0.0,0.0
3,3,0,1290202964,0,1,0.0,0.451364,0.0,0.0,0.0,0.0,0.0,0.595071,0.0,0.664952,0.0
4,4,0,1290208819,0,1,0.245909,0.200288,0.107051,0.265318,0.0,0.092031,0.236478,0.088019,0.520967,0.688487,0.584615


In [13]:
network_df.rename(columns={'negative': 'comma_separated_list_of_features', 'neutral': '', 'positive': ''}, inplace=True)
network_df.head()

Unnamed: 0,user_id,item_id,timestamp,state_label,comma_separated_list_of_features,5,6,7,8,9,10,11,12,13,14,15
0,0,0,1290197836,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0,1290198787,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.472947
2,2,0,1290200117,0,1,0.350455,0.285439,0.0,0.0,0.361794,0.0,0.337014,0.0,0.74245,0.0,0.0
3,3,0,1290202964,0,1,0.0,0.451364,0.0,0.0,0.0,0.0,0.0,0.595071,0.0,0.664952,0.0
4,4,0,1290208819,0,1,0.245909,0.200288,0.107051,0.265318,0.0,0.092031,0.236478,0.088019,0.520967,0.688487,0.584615


In [15]:
# Save the network DataFrame to a CSV file
network_df.to_csv('data/steam.csv', index=False)

## Understanding dataset MOOC

In [11]:
mooc = pd.read_csv('data/mooc.csv', sep=',')
mooc.head()

Unnamed: 0,user_id,item_id,timestamp,state_label,comma_separated_list_of_features,Unnamed: 5,Unnamed: 6,Unnamed: 7
0,0,0,0.0,0,-0.319991,-0.435701,0.106784,-0.067309
1,0,1,6.0,0,-0.319991,-0.435701,0.106784,-0.067309
2,0,2,41.0,0,-0.319991,-0.435701,0.106784,-0.067309
3,0,1,49.0,0,-0.319991,-0.435701,0.106784,-0.067309
4,0,2,51.0,0,-0.319991,-0.435701,0.106784,-0.067309


In [13]:
mooc[(mooc['state_label'] == 1)]

Unnamed: 0,user_id,item_id,timestamp,state_label,comma_separated_list_of_features,Unnamed: 5,Unnamed: 6,Unnamed: 7
135,4,16,39882.0,1,-0.319991,2.108722,-0.394237,-0.067309
307,42,14,61818.0,1,1.028091,-0.435701,-0.394237,-0.067309
381,46,5,65572.0,1,-0.319991,-0.435701,0.106784,-0.067309
459,68,9,67169.0,1,-0.319991,-0.435701,2.611890,5.351483
626,87,13,69018.0,1,-0.319991,2.108722,-0.394237,-0.067309
...,...,...,...,...,...,...,...,...
411457,7045,16,2546890.0,1,-0.319991,2.108722,-0.394237,-0.067309
411658,6863,25,2561266.0,1,-0.319991,-0.435701,0.607805,1.538259
411661,7046,5,2561994.0,1,-0.319991,-0.435701,0.106784,-0.067309
411665,31,96,2563023.0,1,2.376173,-0.435701,-0.394237,-0.067309


In [17]:
mooc[(mooc['user_id'] == 1812)
     & (mooc['item_id'] == 8)]

Unnamed: 0,user_id,item_id,timestamp,state_label,comma_separated_list_of_features,Unnamed: 5,Unnamed: 6,Unnamed: 7
37425,1812,8,253904.0,0,-0.319991,-0.435701,0.106784,-0.067309
37606,1812,8,255054.0,0,-0.319991,-0.435701,0.106784,-0.067309
37618,1812,8,255217.0,1,-0.319991,-0.435701,0.106784,-0.067309
