<a href="https://colab.research.google.com/github/MadeaRiggs/AIPlanet-Deep-Learning-projects/blob/main/AIPlanet_Sports_Theme_Identification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import os # OS module in Python provides a way of using operating system dependent functionality
import cv2 # Library for image processing
from sklearn.model_selection import train_test_split # For splitting the data into train and validation set

In [None]:
!wget https://s3.us-west-1.wasabisys.com/dphi/public-datasets/Data%20Sprint%2080%20Sports%20Genre%20Identification/Sports_Identification.zip

--2023-02-04 12:16:14--  https://s3.us-west-1.wasabisys.com/dphi/public-datasets/Data%20Sprint%2080%20Sports%20Genre%20Identification/Sports_Identification.zip
Resolving s3.us-west-1.wasabisys.com (s3.us-west-1.wasabisys.com)... 38.146.40.11, 38.146.40.17, 38.146.40.12, ...
Connecting to s3.us-west-1.wasabisys.com (s3.us-west-1.wasabisys.com)|38.146.40.11|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 352952524 (337M) [application/x-zip-compressed]
Saving to: ‘Sports_Identification.zip’


2023-02-04 12:16:25 (34.2 MB/s) - ‘Sports_Identification.zip’ saved [352952524/352952524]



In [None]:
!apt-get install p7zip-full

Reading package lists... Done
Building dependency tree       
Reading state information... Done
p7zip-full is already the newest version (16.02+dfsg-7build1).
The following package was automatically installed and is no longer required:
  libnvidia-common-510
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 27 not upgraded.


In [None]:
!7za x /content/Sports_Identification.zip


7-Zip (a) [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,2 CPUs Intel(R) Xeon(R) CPU @ 2.20GHz (406F0),ASM,AES-NI)

Scanning the drive for archives:
  0M Scan /content/                   1 file, 352952524 bytes (337 MiB)

Extracting archive: /content/Sports_Identification.zip
 30% 4096 Open              --
Path = /content/Sports_Identification.zip
Type = zip
Physical Size = 352952524

  0%      3% 431 - Sports Identification/train/Image_9339.jpg                                                       5% 766 - Sports Identification/test/Image_3685.jpg                                                      6% 921 - Sports Identi

In [None]:

# Read the training annotations: each row gives an image filename and its sport label
training_csv_path = "/content/Sports Identification/Training_set.csv"
labels = pd.read_csv(training_csv_path)
labels.head()

Unnamed: 0,filename,label
0,Image_1.jpg,golf
1,Image_2.jpg,roller derby
2,Image_3.jpg,high jump
3,Image_4.jpg,bungee jumping
4,Image_5.jpg,rugby


In [None]:
#check for unique values
# Displays every distinct sport name found in the 'label' column.
labels['label'].unique()

array(['golf', 'roller derby', 'high jump', 'bungee jumping', 'rugby',
       'sumo wrestling', 'billiards', 'judo', 'figure skating pairs',
       'baseball', 'hockey', 'fencing', 'frisbee', 'snow boarding',
       'cheerleading', 'hammer throw', 'jai alai', 'skydiving', 'hurdles',
       'chuckwagon racing', 'canoe slamon', 'figure skating men',
       'balance beam', 'steer wrestling', 'baton twirling',
       'snowmobile racing', 'luge', 'swimming', 'pole vault', 'boxing',
       'table tennis', 'football', 'track bicycle', 'nascar racing',
       'polo', 'air hockey', 'hang gliding', 'weightlifting',
       'parallel bar', 'hydroplane racing', 'horse jumping', 'tug of war',
       'croquet', 'ice yachting', 'shot put', 'field hockey', 'tennis',
       'formula 1 racing', 'rings', 'gaga', 'cricket',
       'horseshoe pitching', 'archery', 'sidecar racing', 'curling',
       'disc golf', 'bike polo', 'olympic wrestling', 'ski jumping',
       'pole climbing', 'sailboat racing', 'mus

In [None]:
# Pair every training filename with its full path inside the extracted train folder
train_image_dir = '/content/Sports Identification/train/'
file_paths = []
for fname in labels['filename']:
  file_paths.append([fname, train_image_dir + fname])

In [None]:
# Confirm that every image named in the label file is really present on disk.
# file_paths is generated from labels, so comparing the two lengths alone can never fail;
# the meaningful check is whether each listed path exists.
missing_train_files = [path for _, path in file_paths if not os.path.isfile(path)]
if len(labels) == len(file_paths) and not missing_train_files:
  print('Number of labels i.e. ', len(labels), 'matches the number of filenames i.e. ', len(file_paths))
else:
  print('Number of labels does not match the number of filenames;',
        len(missing_train_files), 'listed images are missing, e.g.', missing_train_files[:5])

Number of labels i.e.  9500 matches the number of filenames i.e.  9500


In [None]:
# Tabulate the [filename, path] pairs as a two-column frame
path_columns = ['filename', 'filepaths']
images = pd.DataFrame(data=file_paths, columns=path_columns)
images.head()

Unnamed: 0,filename,filepaths
0,Image_1.jpg,/content/Sports Identification/train/Image_1.jpg
1,Image_2.jpg,/content/Sports Identification/train/Image_2.jpg
2,Image_3.jpg,/content/Sports Identification/train/Image_3.jpg
3,Image_4.jpg,/content/Sports Identification/train/Image_4.jpg
4,Image_5.jpg,/content/Sports Identification/train/Image_5.jpg


In [None]:
# Attach each image's sport label to its path by joining on the shared 'filename' column
train_data = images.merge(labels, on='filename', how='inner')
train_data.head()

Unnamed: 0,filename,filepaths,label
0,Image_1.jpg,/content/Sports Identification/train/Image_1.jpg,golf
1,Image_2.jpg,/content/Sports Identification/train/Image_2.jpg,roller derby
2,Image_3.jpg,/content/Sports Identification/train/Image_3.jpg,high jump
3,Image_4.jpg,/content/Sports Identification/train/Image_4.jpg,bungee jumping
4,Image_5.jpg,/content/Sports Identification/train/Image_5.jpg,rugby


In [None]:
# Convert the sport names (categorical) to integer class ids (numerical).
from sklearn.preprocessing import LabelEncoder

# Fit on the untouched names in `labels`, so the name <-> id mapping is identical
# no matter how many times this cell is executed.
label_encoded = LabelEncoder().fit(labels['label'])

# Encode only while the column still holds names. Re-fitting on already-encoded ids
# (which is what happened when this cell was run twice) would make the later
# inverse_transform return numbers instead of sport names.
if not pd.api.types.is_numeric_dtype(train_data['label']):
  train_data['label'] = label_encoded.transform(train_data['label'])

In [None]:
# Preview the frame after encoding: the 'label' column now holds integer class ids.
train_data.head()

Unnamed: 0,filename,filepaths,label
0,Image_1.jpg,/content/Sports Identification/train/Image_1.jpg,36
1,Image_2.jpg,/content/Sports Identification/train/Image_2.jpg,68
2,Image_3.jpg,/content/Sports Identification/train/Image_3.jpg,40
3,Image_4.jpg,/content/Sports Identification/train/Image_4.jpg,17
4,Image_5.jpg,/content/Sports Identification/train/Image_5.jpg,71


## Data Pre-processing

In [None]:
# Load every training image as grayscale, shrink it to image_size x image_size,
# and collect [image, encoded_label] pairs.
data = [] # plain Python list; it is converted to a numpy array afterwards
image_size = 100 # image size taken is 100 here. one can take other size too
for filepath, label in zip(train_data['filepaths'], train_data['label']):

  img_array = cv2.imread(filepath, cv2.IMREAD_GRAYSCALE) # converting the image to gray scale
  if img_array is None:
    # cv2.imread signals a missing/corrupt file by returning None rather than raising;
    # fail here with the offending path instead of a cryptic error inside cv2.resize.
    raise FileNotFoundError('Could not read training image: ' + str(filepath))

  # interpolation must be passed by keyword: the third positional parameter of
  # cv2.resize is `dst`, so a positional cv2.INTER_AREA is not used as the interpolation mode.
  new_img_array = cv2.resize(img_array, (image_size, image_size), interpolation=cv2.INTER_AREA) # resizing the image array
  data.append([new_img_array, label])
  

In [None]:
# Pack the [image, label] pairs into an object array: one row per sample,
# column 0 holds the image and column 1 holds its label.
data= np.array(data, dtype=object)

In [None]:
# Expect (number_of_samples, 2): an image column and a label column.
data.shape

(9500, 2)

In [None]:
#Shuffle the data
# Seed NumPy's global generator first so a fresh Restart & Run All produces the same
# row order (and therefore the same train/validation split) every time.
np.random.seed(42)
np.random.shuffle(data)

In [None]:
# Split each [image, label] row into the feature array x and the target array y
x = np.array([sample[0] for sample in data])
y = np.array([sample[1] for sample in data])

In [None]:
# Add the trailing single-channel axis: (samples, height, width, 1).
# Use image_size rather than a repeated literal so the resize step and this reshape cannot drift apart.
x = x.reshape(-1, image_size, image_size, 1)

In [None]:
# Number of distinct classes in the training labels.
train_data['label'].nunique()

100

In [None]:
# Class ids together with how many samples each one has (shows the class imbalance).
np.unique(y, return_counts=True)

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
        51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
        68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
        85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]),
 array([ 78,  78,  92,  69,  79, 106,  86, 122, 118,  76,  78, 101,  98,
         97,  84,  81, 104,  88, 115,  92,  84,  91,  95,  99,  86,  95,
        111, 102, 106, 111,  94, 134, 134,  93,  83, 106, 110,  85,  85,
        103, 111, 120,  96,  97,  71,  96,  76,  91,  78,  92,  95,  83,
        105, 106,  90,  89,  92,  78, 133, 117,  93,  83,  78,  97, 110,
         84, 117,  99, 114, 111, 100, 116, 101, 108,  91,  95,  91,  41,
        104,  82, 103,  96,  85,  93,  99, 110,  99,  92,  95,  92,  90,
         69, 111, 103,  72,  91,  94,  

In [None]:
#splitting dataset
# Hold out 20% for validation. stratify=y keeps the per-class proportions the same in both
# parts, which matters here because the 100 classes have very uneven sample counts.
X_train, X_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
# Inspect the first training image before scaling.
X_train[0]

array([[[139],
        [142],
        [138],
        ...,
        [131],
        [129],
        [139]],

       [[181],
        [162],
        [173],
        ...,
        [137],
        [137],
        [137]],

       [[155],
        [167],
        [152],
        ...,
        [134],
        [124],
        [134]],

       ...,

       [[156],
        [150],
        [142],
        ...,
        [ 97],
        [ 83],
        [ 93]],

       [[148],
        [152],
        [149],
        ...,
        [ 98],
        [ 99],
        [101]],

       [[150],
        [149],
        [150],
        ...,
        [103],
        [ 98],
        [ 78]]], dtype=uint8)

In [None]:
#normalizing train and validation data
# Scale pixel intensities from [0, 255] to [0, 1].
# The dtype guard makes the cell safe to re-run: arrays that were already scaled (float)
# are left alone instead of being divided by 255 a second time.
# float32 halves the memory of the float64 that a plain `/ 255` would produce.
if X_train.dtype == np.uint8:
  X_train = X_train.astype('float32') / 255.0
if X_val.dtype == np.uint8:
  X_val = X_val.astype('float32') / 255.0

In [None]:
# Inspect the first training image again after the division by 255.
X_train[0]

array([[[0.54509804],
        [0.55686275],
        [0.54117647],
        ...,
        [0.51372549],
        [0.50588235],
        [0.54509804]],

       [[0.70980392],
        [0.63529412],
        [0.67843137],
        ...,
        [0.5372549 ],
        [0.5372549 ],
        [0.5372549 ]],

       [[0.60784314],
        [0.65490196],
        [0.59607843],
        ...,
        [0.5254902 ],
        [0.48627451],
        [0.5254902 ]],

       ...,

       [[0.61176471],
        [0.58823529],
        [0.55686275],
        ...,
        [0.38039216],
        [0.3254902 ],
        [0.36470588]],

       [[0.58039216],
        [0.59607843],
        [0.58431373],
        ...,
        [0.38431373],
        [0.38823529],
        [0.39607843]],

       [[0.58823529],
        [0.58431373],
        [0.58823529],
        ...,
        [0.40392157],
        [0.38431373],
        [0.30588235]]])

In [None]:
# Collapse every image into a single row of 100*100 pixel values.
# NOTE(review): these flat copies suit a dense-only network; a model whose first layer is
# Conv2D with input_shape=(100, 100, 1) needs the un-flattened X_train / X_val instead.
pixels_per_image = 100*100
X_train_flattened = np.reshape(X_train, (len(X_train), pixels_per_image))
X_val_flattened = np.reshape(X_val, (len(X_val), pixels_per_image))

In [None]:
# Inspect the first flattened training row.
X_train_flattened[0]

array([0.54509804, 0.55686275, 0.54117647, ..., 0.40392157, 0.38431373,
       0.30588235])

In [None]:
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping

In [None]:
# Assemble the network one layer at a time
cnn = tf.keras.models.Sequential()

# Feature extractor: two convolution + max-pooling stages on 100x100 single-channel input
cnn.add(tf.keras.layers.Conv2D(filters=32, kernel_size=(3, 3), activation='relu', input_shape=(100, 100, 1)))
cnn.add(tf.keras.layers.MaxPooling2D((2, 2)))
cnn.add(tf.keras.layers.Conv2D(filters=64, kernel_size=(3, 3), activation='relu'))
cnn.add(tf.keras.layers.MaxPooling2D((2, 2)))

# Turn the feature maps into one vector per image
cnn.add(tf.keras.layers.Flatten())

# Classifier head: two identical dense blocks, each followed by dropout
for dense_block in range(2):
  cnn.add(tf.keras.layers.Dense(256, activation='relu'))
  cnn.add(tf.keras.layers.Dropout(rate=0.3))

# One softmax output per class
cnn.add(tf.keras.layers.Dense(100, activation='softmax'))

In [None]:
# Adam with its default settings; the sparse loss accepts the integer class ids directly
compile_options = {
  'optimizer': 'adam',
  'loss': 'sparse_categorical_crossentropy',
  'metrics': ['accuracy'],
}
cnn.compile(**compile_options)

In [None]:
# Train on the 4-D image tensors. The first layer is a Conv2D declared with
# input_shape=(100, 100, 1), so passing the flattened (samples, 10000) array makes
# Keras raise a ValueError before the first epoch.
# validation_data reports held-out loss/accuracy after every epoch; keeping the returned
# History in a variable also stops its repr from being printed as the cell output.
history = cnn.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=20,
    batch_size=64)

Epoch 1/20


ValueError: ignored

In [None]:
# Score the trained network on the held-out validation images
cnn.evaluate(x=X_val, y=y_val)

In [None]:
# Read the file that fixes the order in which test images must be predicted
testing_csv_path = "/content/Sports Identification/Testing_set.csv"
test_image_order = pd.read_csv(testing_csv_path)
test_image_order.head()

In [None]:
# Non-null entry count for each column of the test listing.
test_image_order.count()

In [None]:
# Pair every test filename with its full path inside the extracted test folder
test_image_dir = '/content/Sports Identification/test/'
testfile_paths = []
for fname in test_image_order['filename']:
  testfile_paths.append([fname, test_image_dir + fname])

In [None]:
# Confirm that every image named in the test listing is really present on disk.
# testfile_paths is generated from test_image_order, so comparing the two lengths alone
# can never fail; the meaningful check is whether each listed path exists.
missing_test_files = [path for _, path in testfile_paths if not os.path.isfile(path)]
if len(test_image_order) == len(testfile_paths) and not missing_test_files:
  print('Number of labels i.e. ', len(test_image_order), 'matches the number of filenames i.e. ', len(testfile_paths))
else:
  print('Number of labels does not match the number of filenames;',
        len(missing_test_files), 'listed images are missing, e.g.', missing_test_files[:5])

In [None]:
# Number of rows in the test listing.
len(test_image_order)

In [None]:
# Number of [filename, path] pairs built for the test images.
len(testfile_paths)

In [None]:
# Tabulate the test [filename, path] pairs and look at the last few rows
test_path_columns = ['filename', 'filepaths']
test_images = pd.DataFrame(data=testfile_paths, columns=test_path_columns)
test_images.tail()

In [None]:
#loading and resizing test images
test_pixel_data = [] # plain Python list of resized grayscale images, one per row of test_images
image_size = 100
missing_files = [] # paths that could not be read

for filepath in test_images['filepaths']:

  img_array = cv2.imread(filepath, cv2.IMREAD_GRAYSCALE) # converting the image to gray scale
  if img_array is None:
    # cv2.imread returns None for a missing/corrupt file instead of raising.
    # Record the path and keep a blank placeholder so the images stay aligned,
    # row for row, with test_images (the submission relies on that order).
    missing_files.append(filepath)
    test_pixel_data.append(np.zeros((image_size, image_size), dtype=np.uint8))
    continue

  # Test images must be resized the same way as intended for the training images
  # (INTER_AREA); interpolation has to be given by keyword because the third
  # positional parameter of cv2.resize is `dst`.
  new_img_array = cv2.resize(img_array, (image_size, image_size), interpolation=cv2.INTER_AREA) # resizing the image array
  test_pixel_data.append(new_img_array)

if missing_files:
  print(len(missing_files), 'test images could not be read and were replaced by blank images, e.g.', missing_files[:5])

In [None]:
# Stack the list of equally sized test images into a single numpy array.
test_pixel_data = np.array(test_pixel_data)

In [None]:
# Add the trailing single-channel axis: (samples, height, width, 1).
# Use image_size rather than a repeated literal so the resize step and this reshape cannot drift apart.
test_pixel_data = test_pixel_data.reshape(-1, image_size, image_size, 1)

In [None]:
#predicting
# Scale the test pixels to [0, 1] exactly as the training and validation images were
# scaled; the network never saw raw 0-255 intensities, so unscaled input gives
# meaningless predictions. The division creates a new array and leaves test_pixel_data as is.
pred = cnn.predict(test_pixel_data / 255)
pred[0]

In [None]:
# For each test image keep the index of the class with the largest predicted probability
prediction = [np.argmax(class_probabilities) for class_probabilities in pred]

In [None]:
#convert numerical data to categorical data
# Map the predicted integer class ids back to label values using the fitted encoder.
predictions = label_encoded.inverse_transform(prediction)

In [None]:
# Display the decoded predictions.
predictions

In [None]:
# Build the submission table (test filename next to its predicted label)
# and write it to disk without the index column
res = test_images[['filename']].assign(label=predictions)
res.to_csv("submission.csv", index=False)