In [43]:
import pandas as pd
import tensorflow as tf
import ast

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid", palette="pastel", color_codes=True)
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import accuracy_score

In [44]:
data=pd.read_csv('Data/games_detailed_info.csv',low_memory=False)

## Split and Clean Columns

In [94]:
data=data[data.bayesaverage !=0]
data=data[data.averageweight !=0]

# Categorize a game as good is it has a rating of 6+
def good(x):
    if x >= 6.5:
        return 1
    else: return 0
    
data['good']=data['bayesaverage'].apply(good)

In [95]:
ownership_df=data[['id','bayesaverage','good','owned','trading','wanting','wishing']]
ratings_df=data[['id','bayesaverage','good','averageweight','numcomments','numweights','usersrated']]

In [96]:
data_other=data[['id', 'bayesaverage', 'good', 'primary', 'boardgameartist', 'boardgamecategory',
                'boardgamedesigner', 'boardgamemechanic', 'boardgamepublisher',
                'maxplayers', 'minplayers', 'maxplaytime', 'playingtime', 'minplaytime',  'thumbnail', 'yearpublished']]
data_other=data_other.dropna(how='any')

In [97]:
mechanics_df=data_other[['id', 'good', 'boardgamemechanic']]

In [98]:
mechanics_df['boardgamemechanic']=mechanics_df['boardgamemechanic'].apply(ast.literal_eval)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [101]:
mlb = MultiLabelBinarizer()

mechanics_df = mechanics_df.join(
        pd.DataFrame( 
            mlb.fit_transform(mechanics_df['boardgamemechanic']),
            index=mechanics_df.index,
            columns=mlb.classes_))

In [102]:
mechanics_df

Unnamed: 0,id,good,boardgamemechanic,Acting,Action / Movement Programming,Action Point Allowance System,Area Control / Area Influence,Area Enclosure,Area Movement,Area-Impulse,...,Storytelling,Take That,Tile Placement,Time Track,Trading,Trick-taking,Variable Phase Order,Variable Player Powers,Voting,Worker Placement
0,13,1,"[Dice Rolling, Modular Board, Route/Network Bu...",0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,822,1,"[Area Control / Area Influence, Tile Placement]",0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,30549,1,"[Action Point Allowance System, Cooperative Pl...",0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
3,68448,1,"[Card Drafting, Hand Management, Set Collectio...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,36218,1,"[Card Drafting, Deck / Pool Building, Hand Man...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17044,8654,0,"[Roll / Spin and Move, Tile Placement]",0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
17047,10611,0,[Hex-and-Counter],0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17048,237834,0,[Simultaneous Action Selection],0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17051,3628,0,"[Memory, Set Collection]",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [103]:
# Remove target from features data
y = mechanics_df.good
X = mechanics_df.drop(['good','id','boardgamemechanic'],axis=1)

In [104]:
# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [105]:
# Define the model - deep neural net
number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 =  10
hidden_nodes_layer2 = 5


nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, 
                             input_dim=number_input_features, activation="relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))


# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))
nn.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_9 (Dense)              (None, 10)                540       
_________________________________________________________________
dense_10 (Dense)             (None, 5)                 55        
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 6         
Total params: 601
Trainable params: 601
Non-trainable params: 0
_________________________________________________________________


In [106]:
# Compile the Sequential model together and customize metrics
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=50)

# Evaluate the model using the test data
model_loss1, model_accuracy1 = nn.evaluate(X_test_scaled,y_test)
print(f"Loss: {model_loss1}, Accuracy: {model_accuracy1}")

Train on 8253 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Loss: 0.26054528124849824, Accuracy: 0.9116684794425964


In [132]:
# Logistic regression
model = LogisticRegression(solver='lbfgs', random_state=926)
model.fit(X_train_scaled, y_train)

LogisticRegression(random_state=926)

In [133]:
# Display the confusion matrix
y_pred = model.predict(X_test_scaled)
confusion_matrix(y_test, y_pred)

array([[2519,   12],
       [ 217,    3]], dtype=int64)

In [134]:
balanced_accuracy_score(y_test, y_pred)

0.5044475773140332

In [135]:
weights=pd.DataFrame()
weights['mech']=X.columns
weights['weight']=model.coef_[0]
weights.sort_values('weight', ascending=False)

Unnamed: 0,mech,weight
18,Hand Management,0.285121
10,Card Drafting,0.241226
52,Worker Placement,0.220279
50,Variable Player Powers,0.206273
38,Set Collection,0.198672
3,Area Control / Area Influence,0.195864
17,Grid Movement,0.192112
36,Route/Network Building,0.186031
16,Dice Rolling,0.162213
40,Simultaneous Action Selection,0.145986
