In [2]:
# Pull the crew table through the project helper. Per this cell's recorded
# output, main() also writes an Excel file, fetches sprite images, and writes
# to the WordPress DB before handing back the data frame.
from py_files import get_crew_from_api
crew_df = get_crew_from_api.main()

Pulled crew data from Pixel Starships API.
Wrote data to excel file.
Getting sprite images...
sprites/387.png
sprites/9064.png
sprites/9754.png
Wrote data to Wordpress db.
Here's your data frame!


In [None]:
# GET PRESTIGE DATA
# from py_files import get_prestige_from_api
# prestige_df = get_prestige_from_api.main(crew_df['CharacterDesignId'].values)

In [None]:
# GET SPRITE IMAGES
# `import urllib` on its own does not load the `request` submodule, so it is
# imported explicitly here.
import urllib.request
import os.path
baseSprite_url = 'http://apibackup.pixelstarships.com/FileService/DownloadSprite?spriteId='

# urlretrieve cannot create missing parent directories.
os.makedirs('sprites', exist_ok=True)

# Download each crew member's head, body and leg sprite unless it is already
# cached on disk; every new download is logged.
for index, row in crew_df.iterrows():
    for part in ('Head', 'Body', 'Leg'):
        sprite_id = str(row[part])
        filename = 'sprites/' + sprite_id + '.png'
        if not os.path.isfile(filename):
            urllib.request.urlretrieve(baseSprite_url + sprite_id, filename)
            print(filename)

In [None]:
# Load the manually assigned grades. eval_model reads the 'CharacterDesignId'
# column and the '<role>Input' column of the role being evaluated from this frame.
from py_files import get_manual_grades
grades_df = get_manual_grades.main()

In [None]:
# Build the model feature frame from the crew table. Later cells expect it to
# keep a 'CharacterDesignId' column alongside the features (eval_model drops
# that column before calling predict).
from py_files import prep_model_features
feats_df = prep_model_features.main(crew_df)

In [None]:
# Train/test split for the 'Engineer' role. These four globals are what the
# standalone classifier cells further down (kNN sweep, logistic regression,
# perceptron, SVM, decision tree) fit and score on.
from py_files import create_model_sets
train_features, train_labels, test_features, test_labels = create_model_sets.main(feats_df, grades_df, 'Engineer')

# Model evaluation function

In [None]:
def eval_model(model, role, feats_df, grades_df):
    import matplotlib.pyplot as plt
    %matplotlib inline
    # import seaborn as sns
    # sns.set()
    
    train_features, train_labels, test_features, test_labels = create_model_sets.main(feats_df, grades_df, role, p=False)
    
    # run the model on the train/test sets    
    model.fit(train_features, train_labels)
    print("\n The model achieves an R2 value of " + str(model.score(test_features, test_labels)) + " on the " + role + " test set.")
    y_pred_test = model.predict(test_features)
    
    # for each character in the grades list, put the label in the "grade" column
    df = feats_df.copy()
    df['grade'] = None  # default is no grade
       
    for ind in range(len(grades_df)):
        g = grades_df[f"{role + 'Input'}"].values[ind]
        df.set_value(df.CharacterDesignId == grades_df['CharacterDesignId'].values[ind], 'grade', g)
    
    graded_crew = df[df.grade.isnull() == False].reset_index(drop=True)
    y_pred_all = model.predict(graded_crew.drop(['CharacterDesignId', 'grade'], axis=1))
    
    # Plotting
    plt.figure(figsize = (40,10))
    boundaries = [1.5, 2.5, 3.5]
    
    # plot test grades
    plt.subplot(1,3,1)
    plt.title('Test Set Grading')
    plt.xlabel('manual grades')
    plt.ylabel('model grades')
    plt.scatter(test_labels+0.1-np.random.uniform(size=len(y_pred_test))*0.2, y_pred_test, marker ='+', color = 'blue')
    for v in boundaries:
        plt.plot([0,5], [v,v], color='gray')
    
    # plot all grades
    plt.subplot(1,3,2)
    plt.title('All Crew Grading')
    plt.xlabel('manual grades')
    plt.ylabel('model grades')
    plt.scatter(graded_crew['grade']+0.1-np.random.uniform(size=len(y_pred_all))*0.2, y_pred_all, marker ='+', color = 'blue')
    for v in boundaries:
        plt.plot([0,5], [v,v], color='gray')
        
    plt.show()
    
    return graded_crew['CharacterDesignId'], graded_crew['grade'], y_pred_all;


    # print labels that would be written wrong (wouldn't round to the right grade)
    for ind in range(len(y_pred_all)):
        if graded_crew['grade'].values[ind] > 1.9:  # only worry about 2,3,4 grades. 0 and 1 aren't going into the post
            if round(y_pred_all[ind], 0) != graded_crew['grade'].values[ind]:
                print('Prediction error: CrewId' + str(df['CharacterDesignId'].values[ind]) 
                      + ' should be ' + str(df['grade'].values[ind]) + ' but the model predicts ' + str(round(y_pred_all[ind], 2)))

In [None]:
def show_errors(ids, manual_grades, model_grades, crew_df):
    """Print every crew member whose rounded model grade disagrees with the manual grade.

    Only manual grades above 1.9 (i.e. 2, 3 and 4) are checked; 0 and 1 aren't
    going into the post.

    Parameters
    ----------
    ids, manual_grades, model_grades
        Equal-length sequences (list, array or pandas Series). They are read
        by position, so a Series may carry any index.
    crew_df : pandas.DataFrame
        Crew table with ``CharacterDesignId`` and ``CharacterDesignName``
        columns, used to turn an id into a readable name.

    Returns
    -------
    None
        The report is written to stdout.
    """
    import numpy as np

    # Build the id -> name lookup once. setdefault keeps the first occurrence
    # of a duplicated id, matching what list.index() would have returned.
    name_by_id = {}
    for crew_id, crew_name in zip(crew_df['CharacterDesignId'].values.tolist(),
                                  crew_df['CharacterDesignName'].values.tolist()):
        name_by_id.setdefault(crew_id, crew_name)

    # np.asarray strips any pandas index so the three inputs are paired by position.
    rows = zip(np.asarray(ids), np.asarray(manual_grades), np.asarray(model_grades))
    for crew_id, manual, predicted in rows:
        if manual > 1.9 and round(predicted, 0) != manual:
            # bad grading; fall back to the raw id if the crew table lacks it
            crewname = name_by_id.get(crew_id, 'CrewId ' + str(crew_id))
            print('Prediction error: ' + crewname +
                  ' should be ' + str(manual) + ' but the model predicts ' +
                  str(round(predicted, 2)))

# Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
import numpy as np

# Linear regression: evaluate each role in turn and list the crew it mis-grades.
# The same estimator object is refit on every pass, so once the loop finishes
# Regmodel holds the fit for the last role ('Pilot').
Regmodel = LinearRegression()
for role in ('Gunner', 'Shielder', 'Engineer', 'Pilot'):
    ids, manual_g, model_g = eval_model(Regmodel, role, feats_df, grades_df)
    show_errors(ids, manual_g, model_g, crew_df)

In [None]:
# Which features carry the largest positive / negative weights?
coefs = Regmodel.coef_
ranked = np.argsort(coefs)               # coefficient positions, ascending by value
top_feat_inds = ranked[-7:][::-1]        # seven largest, biggest first
bottom_feat_inds = ranked[:7]            # seven smallest, most negative first
feature_names = feats_df.columns.tolist()


def _report_coefs(indices):
    """Print the feature name and coefficient for each coefficient position given."""
    for i in indices:
        # one label later, since we dropped CrewId after model_data
        print("The feature " + feature_names[i+1] +
              " has a coefficient of " + str(coefs[i]))


# features that bring rating up
_report_coefs(top_feat_inds)
print("\n")
# features that bring rating down
_report_coefs(bottom_feat_inds)

# kNN

In [None]:
# k-nearest-neighbours (k = 1), run through the same evaluation helpers.
# eval_model labels the score "R2"; for a classifier that number is whatever
# the estimator's own score() returns.
from sklearn.neighbors import KNeighborsClassifier

kNN = KNeighborsClassifier(n_neighbors = 1)
for role in ('Gunner', 'Shielder'):
    ids, manual_g, model_g = eval_model(kNN, role, feats_df, grades_df)
    show_errors(ids, manual_g, model_g, crew_df)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Compare kNN classifiers over several k on the train/test split built earlier
# in the notebook.
k_values = [1,3,5,7,9]

for k in k_values:

    kNN = KNeighborsClassifier(n_neighbors = k)
    kNN.fit(train_features, train_labels)

    # take a stab at the test set
    y_pred = kNN.predict(test_features)

    # exact hits, and hits when +/- 1 star still counts as "accurate"
    exact = (y_pred == test_labels)
    within_one = exact | (y_pred == test_labels - 1) | (y_pred == test_labels + 1)

    # Scale to a percentage BEFORE rounding: round(x, 3) * 100 prints float
    # artifacts such as 57.099999999999994.
    acc = round(100 * exact.mean(), 1)
    acc2 = round(100 * within_one.mean(), 1)

    print("The kNN classifier with k=" + str(k) + " had an accuracy of "
          + str(acc) + "% (and a +/-1 grade accuracy of " + str(acc2) + "%)." )

In [None]:
# Refit the k = 1 classifier (the best of the sweep) and list its test-set misses.
kNN = KNeighborsClassifier(n_neighbors = 1)
kNN.fit(train_features, train_labels)

y_pred = kNN.predict(test_features)
missed = y_pred != test_labels      # True wherever prediction and label differ
errors = test_labels[missed]

print("\nThe ratings misclassified by the classifier were: ")
print(str(y_pred[missed]) + "-- predicted values")
print(str(errors) + "-- actual values")

In [None]:
# eval_model imports matplotlib only inside its own body, so `plt` has to be
# imported at notebook scope before it can be used here.
import matplotlib.pyplot as plt

# Manual vs. predicted grades for the kNN predictions in y_pred; both axes are
# jittered by up to +/-0.1 so identical integer grades do not overlap.
plt.figure(figsize = (20,5))
plt.subplot(1,3,2)
plt.title('kNN: Test Set Grading')
plt.xlabel('manual grades')
plt.ylabel('model grades')
plt.scatter(test_labels+0.1-np.random.uniform(size=len(y_pred))*0.2, y_pred+0.1-np.random.uniform(size=len(y_pred))*0.2)

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
import numpy as np

# logistic regression model
LogRegmodel = LogisticRegression(multi_class = 'multinomial', solver='newton-cg')
LogRegmodel.fit(train_features, train_labels)

# score() on a scikit-learn classifier is mean accuracy, not R2, so the
# message names it accordingly.
print("The logistic regression model achieves an accuracy of " + str(LogRegmodel.score(test_features, test_labels)))

# print the values the model gets wrong
y_pred = LogRegmodel.predict(test_features)
missed = y_pred != test_labels
errors = test_labels[missed]
print("\nThe ratings misclassified by the classifier were: ")
print(str(y_pred[missed]) + "-- predicted values")
print(str(errors) + "-- actual values")

In [None]:
# eval_model imports matplotlib only inside its own body, so `plt` has to be
# imported at notebook scope before it can be used here.
import matplotlib.pyplot as plt

# Manual vs. predicted grades for the logistic-regression predictions in
# y_pred; both axes are jittered by up to +/-0.1 so identical integer grades
# do not overlap.
plt.figure(figsize = (20,5))
plt.subplot(1,3,2)
plt.title('Logistic Regression: Test Set Grading')
plt.xlabel('manual grades')
plt.ylabel('model grades')
plt.scatter(test_labels+0.1-np.random.uniform(size=len(y_pred))*0.2, y_pred+0.1-np.random.uniform(size=len(y_pred))*0.2)

# Perceptron

In [None]:
from sklearn.linear_model import Perceptron
# Perceptron classifier on the shared train/test split.
perceptron = Perceptron()
perceptron.fit(train_features, train_labels)

# score() on a scikit-learn classifier is mean accuracy, not R2, so the
# message names it accordingly.
print("The perceptron model achieves an accuracy of " + str(perceptron.score(test_features, test_labels)))

# print the values the model gets wrong (predict is called once; the second,
# identical call was redundant)
y_pred = perceptron.predict(test_features)
missed = y_pred != test_labels
errors = test_labels[missed]
print("\nThe ratings misclassified by the classifier were: ")
print(str(y_pred[missed]) + "-- predicted values")
print(str(errors) + "-- actual values")

# SVM

In [None]:
from sklearn.svm import SVC
# Support vector classifier with a degree-8 polynomial kernel on the shared split.
svc = SVC(kernel='poly', degree=8)
svc.fit(train_features, train_labels)

# score() on a scikit-learn classifier is mean accuracy, not R2, so the
# message names it accordingly.
print("The svm model achieves an accuracy of " + str(svc.score(test_features, test_labels)))
y_pred = svc.predict(test_features)

In [None]:
# eval_model imports matplotlib only inside its own body, so `plt` has to be
# imported at notebook scope before it can be used here.
import matplotlib.pyplot as plt

# Manual vs. predicted grades for the SVM predictions in y_pred; both axes are
# jittered by up to +/-0.1 so identical integer grades do not overlap.
plt.figure(figsize = (20,5))
plt.subplot(1,3,2)
plt.title('SVM: Test Set Grading')
plt.xlabel('manual grades')
plt.ylabel('model grades')
plt.scatter(test_labels+0.1-np.random.uniform(size=len(y_pred))*0.2, y_pred+0.1-np.random.uniform(size=len(y_pred))*0.2)

 # Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree classifier (default settings) on the shared train/test split.
tree = DecisionTreeClassifier()
tree.fit(train_features, train_labels)

# score() on a scikit-learn classifier is mean accuracy, not R2, so the
# message names it accordingly.
print("The decision tree achieves an accuracy of " + str(tree.score(test_features, test_labels)))
y_pred = tree.predict(test_features)

In [None]:
# eval_model imports matplotlib only inside its own body, so `plt` has to be
# imported at notebook scope before it can be used here.
import matplotlib.pyplot as plt

# Manual vs. predicted grades for the decision-tree predictions in y_pred;
# both axes are jittered by up to +/-0.1 so identical integer grades do not
# overlap.
plt.figure(figsize = (20,5))
plt.subplot(1,3,2)
plt.title('Decision Tree: Test Set Grading')
plt.xlabel('manual grades')
plt.ylabel('model grades')
plt.scatter(test_labels+0.1-np.random.uniform(size=len(y_pred))*0.2, y_pred+0.1-np.random.uniform(size=len(y_pred))*0.2)