<a href="https://colab.research.google.com/github/Kartick-rocks/SportsVideoData/blob/main/datamodeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Importing the necessary libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import csv
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split

# Load the raw dataset. X holds every column except the last (the independent
# features); y is the last column (the dependent variable).
dataset = pd.read_csv('Rawvideo_data.csv')
# Copy the slice so the imputation below writes into an independent frame
# instead of a view of `dataset` (avoids pandas' SettingWithCopy behaviour).
X = dataset.iloc[:, :-1].copy()
y = dataset.iloc[:, -1]

# Fill missing values in the feature column at position 2 with that column's mean.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X.iloc[:, [2]] = imputer.fit_transform(X.iloc[:, [2]])

# One-hot encode the categorical feature columns; the remaining column
# (position 2) is passed through and ends up last in the output.
# sparse_threshold=0 forces a dense result: with the default threshold the
# transformer may return a SciPy sparse matrix, which np.array() cannot turn
# into a 2-D array of rows for the CSV writer.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0, 1, 3, 4, 5])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(X))

# Header for the processed feature file: 19 encoded columns followed by 'Age'.
fields = ['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10',
          'x11', 'x12', 'x13', 'x14', 'x15', 'x16', 'x17', 'x18', 'x19', 'Age']

# The header is hard-coded, so fail loudly if the encoder produced a different
# number of columns rather than writing a misaligned CSV.
if X.shape[1] != len(fields):
    raise ValueError(
        "Encoded feature matrix has %d columns but %d header names are defined"
        % (X.shape[1], len(fields))
    )

# Encode the categorical output variable as integer labels.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

filename = "processedInputData.csv"

# Write the header and the encoded feature rows. newline='' is what the csv
# module requires so that no extra blank lines appear on Windows; the `with`
# block closes the file, so no explicit close() is needed.
with open(filename, 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(fields)
    csvwriter.writerows(X)

# Re-read the feature file, append the encoded target as the 'Viewed' column
# (joined on the row index) and save the combined table.
df = pd.read_csv('processedInputData.csv')
new_column = pd.DataFrame({'Viewed': y})
df = df.merge(new_column, left_index=True, right_index=True)
df.to_csv('processedData.csv', index=False)


def obtainParams(path='processedData.csv', test_size=0.2, random_state=1):
    """Load the processed dataset and split it into train and test sets.

    All columns except the last are used as features; the last column is
    the target.

    Args:
        path: CSV file to read. Defaults to 'processedData.csv'.
        test_size: Value forwarded to ``train_test_split`` as ``test_size``.
            Defaults to 0.2.
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible. Defaults to 1.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``.
    """
    dataset = pd.read_csv(path)
    X = dataset.iloc[:, :-1]
    y = dataset.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test
  

In [2]:
def RunLogisticRegression():
    """Fit a logistic regression model and return its test-set accuracy.

    The train/test split comes from ``obtainParams()``. The confusion matrix
    for the test set is printed as a side effect.
    """
    features_train, features_test, target_train, target_test = obtainParams()

    model = LogisticRegression(random_state=1)
    model.fit(features_train, target_train)

    # Predict on the held-out rows and evaluate against the true labels.
    predictions = model.predict(features_test)
    matrix = confusion_matrix(target_test, predictions)
    accuracy = accuracy_score(target_test, predictions)

    print(matrix)
    return accuracy


In [3]:
def RunKNN():
    """Fit a k-nearest-neighbours classifier and return its test-set accuracy.

    Uses 5 neighbours with the Minkowski metric at p=2. The train/test split
    comes from ``obtainParams()``. The confusion matrix for the test set is
    printed as a side effect.
    """
    features_train, features_test, target_train, target_test = obtainParams()

    model = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
    model.fit(features_train, target_train)

    # Predict on the held-out rows and evaluate against the true labels.
    predictions = model.predict(features_test)
    matrix = confusion_matrix(target_test, predictions)
    accuracy = accuracy_score(target_test, predictions)

    print(matrix)
    return accuracy

In [4]:
def RunSVM():
    """Fit a linear-kernel SVM classifier and return its test-set accuracy.

    The train/test split comes from ``obtainParams()``. The confusion matrix
    for the test set is printed as a side effect.
    """
    features_train, features_test, target_train, target_test = obtainParams()

    model = SVC(kernel='linear', random_state=0)
    model.fit(features_train, target_train)

    # Predict on the held-out rows and evaluate against the true labels.
    predictions = model.predict(features_test)
    matrix = confusion_matrix(target_test, predictions)
    accuracy = accuracy_score(target_test, predictions)

    print(matrix)
    return accuracy

In [5]:
def RunDecisionTreeClassifier():
    """Fit an entropy-criterion decision tree and return its test-set accuracy.

    The train/test split comes from ``obtainParams()``. The confusion matrix
    for the test set is printed as a side effect.
    """
    features_train, features_test, target_train, target_test = obtainParams()

    model = DecisionTreeClassifier(criterion='entropy', random_state=0)
    model.fit(features_train, target_train)

    # Predict on the held-out rows and evaluate against the true labels.
    predictions = model.predict(features_test)
    matrix = confusion_matrix(target_test, predictions)
    accuracy = accuracy_score(target_test, predictions)

    print(matrix)
    return accuracy

In [6]:
def RunRandomForestClassifier():
    """Fit a 10-tree entropy-criterion random forest and return its test-set accuracy.

    The train/test split comes from ``obtainParams()``. The confusion matrix
    for the test set is printed as a side effect.
    """
    features_train, features_test, target_train, target_test = obtainParams()

    model = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
    model.fit(features_train, target_train)

    # Predict on the held-out rows and evaluate against the true labels.
    predictions = model.predict(features_test)
    matrix = confusion_matrix(target_test, predictions)
    accuracy = accuracy_score(target_test, predictions)

    print(matrix)
    return accuracy

In [7]:
# Train and evaluate every classifier in turn; each call prints its own
# confusion matrix and returns the accuracy on the test split.
score1 = RunLogisticRegression()      # logistic regression
score2 = RunKNN()                     # k-nearest neighbours
score3 = RunSVM()                     # linear SVM
score4 = RunDecisionTreeClassifier()  # decision tree
score5 = RunRandomForestClassifier()  # random forest

# Report the accuracies, one per line, in the order the models were run.
print(score1, score2, score3, score4, score5, sep='\n')


[[6 6]
 [5 3]]
[[4 8]
 [6 2]]
[[7 5]
 [5 3]]
[[5 7]
 [4 4]]
[[8 4]
 [7 1]]
0.45
0.3
0.5
0.45
0.45
