In [1]:
# Load the crew table through the project's get_crew_from_api helper.
# Per the logged output of this cell, main() pulls from the Pixel Starships API
# and also writes the data to an Excel file and to the WordPress database
# before returning the DataFrame.
from py_files import get_crew_from_api
crew_df = get_crew_from_api.main()

Pulled crew data from Pixel Starships API.
Wrote data to excel file.
Wrote data to Wordpress db.
Here's your data frame!


In [2]:
# Fetch prestige data, passing the CharacterDesignId values from crew_df.
# Per the logged output, main() downloads the data, builds a DataFrame and
# saves it to the WordPress database.
from py_files import get_prestige_from_api
prestige_df = get_prestige_from_api.main(crew_df['CharacterDesignId'].values)

Downloading prestige data...
Setting up data frame...
Saving to wordpress database...
Done!


In [3]:
# Quick sanity check of the prestige frame: columns, non-null counts and dtypes.
prestige_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6289 entries, 0 to 52
Data columns (total 3 columns):
CharacterDesignId1     6289 non-null int32
CharacterDesignId2     6289 non-null int32
ToCharacterDesignId    6289 non-null int32
dtypes: int32(3)
memory usage: 122.8 KB


In [4]:
# Load the manually assigned crew grades (the logged output says they are
# retrieved from WordPress); these become the model labels further down.
from py_files import get_manual_grades
grades_df = get_manual_grades.main()

retrieved manual grades from wordpress


In [5]:
# Turn the raw crew table into model features. Per the logged output this
# normalizes ability scores, creates binary category variables and scales
# the numeric variables.
from py_files import prep_model_features
feats_df = prep_model_features.main(crew_df)

Normalizing crews' ability scores...
Creating binary category variables...
Scaling numeric variables...
Ready for modeling!


In [6]:
# Join features with grades, drop ungraded crew and split into train/test sets
# (steps as reported by the logged output of this cell).
# NOTE(review): 'Shielder' presumably selects which grade/role to model -
# confirm against create_model_sets.main.
from py_files import create_model_sets
train_features, train_labels, test_features, test_labels = create_model_sets.main(feats_df, grades_df, 'Shielder')

Matching up grades with crew...
Removing ungraded crew...
The size of the model-able data is:
(276, 40)
Randomly selecting data sets...
We will use 220 training points and 55 test points.


# Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
import numpy as np

# Fit a linear regression that predicts the grade from the crew features.
Regmodel = LinearRegression()
Regmodel.fit(train_features, train_labels)

# For regressors, .score() is the coefficient of determination (R^2) on the given data.
print("The linear regression model achieves an R2 value of " + str(Regmodel.score(test_features, test_labels)))

y_pred = Regmodel.predict(test_features)
# The worst miss is the largest *absolute* residual. The signed maximum only
# sees over-predictions and silently ignores larger under-predictions.
worst_error = np.max(np.abs(y_pred - test_labels))
print("At worst, the predicted score was off by " + str(round(worst_error,3)) + " grades.\n")

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

plt.figure(figsize = (20,5))
plt.subplot(1,3,2)
plt.scatter(test_labels+0.1-np.random.uniform(size=len(y_pred))*0.2, y_pred)

In [None]:
# Which features carry the largest positive / negative weights in the linear model?
coefs = Regmodel.coef_
ascending_order = np.argsort(coefs)
top_feat_inds = ascending_order[-7:][::-1]   # seven largest coefficients, biggest first
bottom_feat_inds = ascending_order[:7]       # seven smallest coefficients, smallest first

# Column labels are looked up one position later than the coefficient index;
# the original author's note: CrewId was dropped after model_data.
column_labels = feats_df.columns.tolist()

# features that bring rating up
for idx in top_feat_inds:
    print("The feature " + column_labels[idx + 1] + " has a coefficient of " + str(coefs[idx]))
print("\n")
# features that bring rating down
for idx in bottom_feat_inds:
    print("The feature " + column_labels[idx + 1] + " has a coefficient of " + str(coefs[idx]))

# kNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Candidate neighbourhood sizes to compare on the test set.
k_values = [1,3,5,7,9]

for k in k_values:

    kNN = KNeighborsClassifier(n_neighbors = k)
    kNN.fit(train_features, train_labels)

    # take a stab at the test set
    y_pred = kNN.predict(test_features)
    n_test = test_labels.shape[0]

    # exact-match accuracy, in percent; round *after* scaling to 100 so the
    # printed value does not pick up float noise such as 56.39999999999999
    n_exact = sum(y_pred == test_labels)
    acc = round(100 * n_exact / n_test, 1)

    # what if you consider +/- 1 grade to be still "accurate"?
    n_within_one = n_exact + sum(y_pred == test_labels-1) + sum(y_pred == test_labels+1)
    acc2 = round(100 * n_within_one / n_test, 1)

    print("The kNN classifier with k=" + str(k) + " had an accuracy of "
          + str(acc) + "% (and a +/-1 grade accuracy of " + str(acc2) + "%)." )

In [None]:
# Refit the classifier with a single neighbour (the setting chosen as best above).
kNN = KNeighborsClassifier(n_neighbors=1).fit(train_features, train_labels)

# List every test point the classifier got wrong: predictions first, then truth.
print("\nThe ratings misclassified by the classifier were: ")
y_pred = kNN.predict(test_features)
is_wrong = y_pred != test_labels
errors = test_labels[is_wrong]
print(f"{y_pred[is_wrong]}-- predicted values")
print(f"{errors}-- actual values")

In [None]:
# Predicted vs. actual grade on the test set for the most recent kNN predictions.
plt.figure(figsize = (20,5))
plt.subplot(1,3,2)
# Both axes hold class labels, so shift each coordinate by a random amount
# within +/-0.1 to keep coincident points distinguishable.
x_jitter = 0.1 - np.random.uniform(size=len(y_pred))*0.2
y_jitter = 0.1 - np.random.uniform(size=len(y_pred))*0.2
plt.scatter(test_labels + x_jitter, y_pred + y_jitter)
# Label the figure so it can be read without the surrounding code.
plt.xlabel("Actual grade (jittered)")
plt.ylabel("Predicted grade (jittered)")
plt.title("kNN: predicted vs. actual");

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
import numpy as np

# Multinomial logistic regression over the grade classes.
LogRegmodel = LogisticRegression(multi_class = 'multinomial', solver='newton-cg')
LogRegmodel.fit(train_features, train_labels)

# For classifiers, .score() is the mean accuracy on the given data, not R^2,
# so report it under its real name.
print("The logistic regression model achieves an accuracy of " + str(LogRegmodel.score(test_features, test_labels)))

# print the values the model gets wrong
print("\nThe ratings misclassified by the classifier were: ")
y_pred = LogRegmodel.predict(test_features)
is_wrong = y_pred != test_labels
errors = test_labels[is_wrong]
print(str(y_pred[is_wrong]) + "-- predicted values")
print(str(errors) + "-- actual values")

In [None]:
# Predicted vs. actual grade on the test set for the logistic regression predictions.
plt.figure(figsize = (20,5))
plt.subplot(1,3,2)
# Both axes hold class labels, so shift each coordinate by a random amount
# within +/-0.1 to keep coincident points distinguishable.
x_jitter = 0.1 - np.random.uniform(size=len(y_pred))*0.2
y_jitter = 0.1 - np.random.uniform(size=len(y_pred))*0.2
plt.scatter(test_labels + x_jitter, y_pred + y_jitter)
# Label the figure so it can be read without the surrounding code.
plt.xlabel("Actual grade (jittered)")
plt.ylabel("Predicted grade (jittered)")
plt.title("Logistic regression: predicted vs. actual");

# Perceptron

In [None]:
from sklearn.linear_model import Perceptron
# Perceptron classifier with scikit-learn's default settings.
perceptron = Perceptron()
perceptron.fit(train_features, train_labels)

# For classifiers, .score() is the mean accuracy on the given data, not R^2,
# so report it under its real name.
print("The perceptron model achieves an accuracy of " + str(perceptron.score(test_features, test_labels)))

# print the values the model gets wrong
print("\nThe ratings misclassified by the classifier were: ")
# Predict once; the earlier duplicate predict() call produced the same array.
y_pred = perceptron.predict(test_features)
is_wrong = y_pred != test_labels
errors = test_labels[is_wrong]
print(str(y_pred[is_wrong]) + "-- predicted values")
print(str(errors) + "-- actual values")

# SVM

In [None]:
from sklearn.svm import SVC
# Support vector classifier with a degree-8 polynomial kernel.
svc = SVC(kernel='poly', degree=8)
svc.fit(train_features, train_labels)

# For classifiers, .score() is the mean accuracy on the given data, not R^2,
# so report it under its real name.
print("The svm model achieves an accuracy of " + str(svc.score(test_features, test_labels)))
# Keep the predictions around for the scatter plot in the next cell.
y_pred = svc.predict(test_features)

In [None]:
# Predicted vs. actual grade on the test set for the SVM predictions.
plt.figure(figsize = (20,5))
plt.subplot(1,3,2)
# Both axes hold class labels, so shift each coordinate by a random amount
# within +/-0.1 to keep coincident points distinguishable.
x_jitter = 0.1 - np.random.uniform(size=len(y_pred))*0.2
y_jitter = 0.1 - np.random.uniform(size=len(y_pred))*0.2
plt.scatter(test_labels + x_jitter, y_pred + y_jitter)
# Label the figure so it can be read without the surrounding code.
plt.xlabel("Actual grade (jittered)")
plt.ylabel("Predicted grade (jittered)")
plt.title("SVM: predicted vs. actual");

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree classifier with scikit-learn's default settings.
tree = DecisionTreeClassifier()
tree.fit(train_features, train_labels)

# For classifiers, .score() is the mean accuracy on the given data, not R^2,
# so report it under its real name.
print("The decision tree achieves an accuracy of " + str(tree.score(test_features, test_labels)))
# Keep the predictions around for the scatter plot in the next cell.
y_pred = tree.predict(test_features)

In [None]:
# Predicted vs. actual grade on the test set for the decision tree predictions.
plt.figure(figsize = (20,5))
plt.subplot(1,3,2)
# Both axes hold class labels, so shift each coordinate by a random amount
# within +/-0.1 to keep coincident points distinguishable.
x_jitter = 0.1 - np.random.uniform(size=len(y_pred))*0.2
y_jitter = 0.1 - np.random.uniform(size=len(y_pred))*0.2
plt.scatter(test_labels + x_jitter, y_pred + y_jitter)
# Label the figure so it can be read without the surrounding code.
plt.xlabel("Actual grade (jittered)")
plt.ylabel("Predicted grade (jittered)")
plt.title("Decision tree: predicted vs. actual");