In [112]:
import numpy as np
from StringIO import StringIO
import string
import scipy 
from sklearn import datasets, linear_model
import math
import matplotlib.pyplot as plt
from sklearn import svm

In [100]:
# this imports our data
data = np.genfromtxt('movie_metadata_sanitized.csv', dtype=None, delimiter=',', autostrip=True)

# this is the uncropped data set
fullDataSet = data

# this chops out the director name, actor 2 name, actor 1 name, movie Title, actor 3 name, plot keywords,
# movie imbd link, language, country, content rating
data = np.delete(data, [2, 7, 11, 12, 15, 17, 18, 20, 21, 22], 1)

# reducing "data" matrix to only numeric data 
onlyNumericData  = np.delete(data, [1,10,12,13], 1)

#vector containing feature names 
colNames = data[0,:]

# cuts out feature label row
movieData = np.delete(data, (0), axis=0)

# finds length,numMovies, and width,numFeatures, of dataset and prints
numMovies = movieData.shape[0]
numFeatures = movieData.shape[1]
print numMovies 
print numFeatures

3756
19


In [101]:
# This function provides an easy way of indexing our data set. 

#The arguments are the movieID, the feature you want or 0 for all features , followed by a boolean 1 or 0 for 
# the cropped data set "data" or the uncropped data set "fullDataSet"

#
# example function calls are: easydex(112,'movieID',1)  
#         This returns the 112 movie's movieID which is 112. 
# example function calls are: easydex(460,0,1)
#         This gives all the data on the 460th movie. 
# example function calls are: easydex(460,0,0)
#         This gives all the data on the 460th movie from the "fullDataSet" 
#         instead of the cropped "data" matrix.

def easydex(MovieRow, FeatureString, selectDataSet):
    
    if selectDataSet == 1:
        dataSet = data
    elif selectDataSet == 2:
        dataSet = onlyNumericData 
    else: 
        dataSet = fullDataSet
    
    MovieRow = MovieRow -1
    my_data_dict= {}
   
    col_names = dataSet[0,:]
   
    # returns specific feature for a specific movie
    if FeatureString != 0:
        for l in range(0,numFeatures):
            my_data_dict[col_names[l]]= dataSet[1:,l]
   
        example1 = my_data_dict[FeatureString]
        return[example1[MovieRow]]
    
    #returns whole row for a specific movie
    else:
        my_data_neat_index = dataSet[1:,:]
        myMovieSample = my_data_neat_index[MovieRow,:]
        return myMovieSample

In [102]:
genres = {'Action', 'Adventure','Animation','Biography','Comedy','Crime','Documentary','Drama','Family','Fantasy',
          'Film-Noir','Game-Show','History','Horror','Music','Musical','Mystery','News','Reality-TV','Romance','Sci-Fi',
          'Short','Sport','Thriller','War','Western'}

moviesByGenreBucket = {}

for genre in genres:
    moviesByGenreBucket[genre] = []

In [103]:
# buckets movieIDs into genre buckets
for movie in movieData:    
    localGenres = string.split(movie[8], "|")
    for genre in localGenres:
        #print "adding movieID = %s to genre bucket %s" % (movie[0], genre)
        moviesByGenreBucket[genre].append(movie[0])
    
    # one hot coding for color feature
    if movie[1] == 'Color':
        movie[1] = 1
    else:
        movie[1] = 0

# remove genre column now that we have consumed it
movieData = np.delete(movieData, 8, 1)

# cast dytpe to float
movieData = movieData.astype(np.float)

In [131]:
# Zscore my data
stats.zscore(movieData, axis=0)

array([[-0.94648502,  0.18477275,  0.3452948 , ..., -0.06181407,
         0.67697286, -0.43587211],
       [ 0.51048827,  0.18477275,  1.89266032, ...,  0.69568556,
         0.67697286,  1.98723707],
       [-0.99628972,  0.18477275, -0.54585813, ..., -1.57681331,
         0.67697286, -0.40842574],
       ..., 
       [ 0.51866517,  0.18477275, -0.14889001, ...,  2.40005971,
        -0.73937356,  0.21650344],
       [-1.26315371,  0.18477275,  1.7468353 , ...,  0.03287339,
         0.67697286,  2.17363008],
       [-1.15908419,  0.18477275, -0.76459566, ..., -1.10337605,
        -0.73937356, -0.40353292]])

In [132]:
# Split data into Test, Train
dataSplitRate = 0.7
numSamples = int(math.floor(numMovies * dataSplitRate))
np.random.shuffle(movieData)
train = movieData[:numSamples]
test = movieData[numSamples:]
trainX = np.delete(train, 15, axis=1)
trainY = train[:,15]
testX = np.delete(test, 15, axis=1)
testY = test[:,15]

In [133]:
# linear regression
regr = linear_model.LinearRegression()
regr.fit(trainX, trainY)

# The coefficients
print('Coefficients: \n', regr.coef_)

# The mean squared error
print("Linear Regrssion Model RMSE : %.2f"
      % math.sqrt(np.mean((regr.predict(testX) - testY) ** 2)))

# use L1 Norm
print("Linear Regrssion Model RMSE : %.2f"
      % math.sqrt(np.mean((scipy.spatial.distance.cityblock(regr.predict(testX), testY)) ** 2)))


# Error of model that naively guesses the mean 
print ("Naive model RMSE: %.2f"
       % math.sqrt(np.mean(testY - np.mean(trainY)) ** 2))

# Explained variance score: 1 is perfect prediction (r^2)
print('R^2 score: %.2f' % regr.score(testX, testY))

# Linear Regrssion Model RMSE : 0.83
# Naive model RMSE: 0.07
# R^2 score: 0.38

('Coefficients: \n', array([  1.98206735e-04,  -2.91185561e-01,   2.59787335e-03,
         1.25826444e-02,   6.34425387e-06,   8.25016949e-05,
         8.06577792e-05,  -2.40664334e-10,   3.74387112e-06,
        -7.93563345e-05,  -1.28576197e-02,  -6.47933488e-04,
         1.53528494e-11,  -1.52241424e-02,   8.33588937e-05,
         1.15513711e-02,  -3.86439434e-06]))
Linear Regrssion Model RMSE : 0.85
Linear Regrssion Model RMSE : 709.08
Naive model RMSE: 0.05
R^2 score: 0.41


In [134]:
# SVM
#clf = svm.SVC()
#clf.fit(trainX, trainY)

In [135]:
cov = np.cov(trainX, rowvar = False)
print cov.shape
print cov

(17, 17)
[[  1.81849930e+06  -2.20171851e+01  -5.09057372e+04  -8.32480577e+03
   -4.74375373e+05  -4.42992632e+05  -3.00921678e+06  -4.44460438e+10
   -6.18996854e+07  -5.04220338e+06   3.84017657e+00  -1.55435003e+05
   -3.37919477e+10  -2.45122714e+03  -1.20008043e+06  -9.77948021e+01
   -4.96349021e+06]
 [ -2.20171851e+01   3.09421955e-02   2.18556575e-02  -1.67693152e-01
   -2.99351829e+01   4.94263087e+00   7.32301743e+01   6.00703024e+05
   -7.46644891e+02   9.26558204e+01  -2.50831812e-03  -5.10398940e+00
    7.61952601e+05   3.29681292e-01   7.31715707e+00   5.00019974e-03
    5.44926079e+01]
 [ -5.09057372e+04   2.18556575e-02   1.49743450e+04   6.71281859e+02
    6.89174645e+04   5.56255153e+04   3.04080925e+05   3.82827571e+09
    1.09625236e+07   5.29936290e+05  -9.00828642e+00   2.78897398e+04
    3.16267122e+09   5.00861844e+02   1.37482188e+05   8.03328664e+00
    1.76853614e+06]
 [ -8.32480577e+03  -1.67693152e-01   6.71281859e+02   4.79706816e+02
    1.27458808e+04   

In [130]:
sum1 = 0
sum2 = 0
for indx in np.arange(testX.shape[0]):
    sum1 += scipy.spatial.distance.cityblock(regr.predict(testX)[indx], testY[indx])
    sum2 += scipy.spatial.distance.cityblock(regr.predict(testX)[indx], np.mean(trainY))
print sum1/testX.shape[0]
print sum2/testX.shape[0]

0.603828151899
0.491688009513
