In [1]:
# Mount Google Drive at /content/drive so the project files stored there
# can be accessed from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import time
import pandas as pd
import matplotlib.pyplot as plt
import os
import copy
from sklearn.mixture import GaussianMixture
from sklearn.metrics import accuracy_score, confusion_matrix
import tqdm
import pickle

In [3]:
# Project root on the mounted Drive; the dataset folders live beneath it
source_path = '/content/drive/My Drive/Pattern Recognition/project/'

# Work from the project root and show what it contains
os.chdir(source_path)
os.listdir('.')

['pb_train', 'pb_test', 'yt_test', 'abc2.csv']

In [4]:
# Dataset folders, each located directly under the project root
pb_train_path, pb_test_path, yt_test_path = (
  source_path + folder for folder in ('pb_train/', 'pb_test/', 'yt_test/')
)

# Names of the CSV files in the training folder. The order is whatever
# os.listdir() returns, and it fixes the index -> language mapping used below.
language_file_names = list(
  filter(lambda entry: entry.endswith('.csv'), os.listdir(pb_train_path))
)
print(language_file_names)

['odi_combined.csv', 'kan_combined.csv', 'mar_combined.csv', 'hin_combined.csv', 'asm_combined.csv', 'mal_combined.csv', 'pun_combined.csv', 'tam_combined.csv', 'tel_combined.csv', 'eng_combined.csv', 'ben_combined.csv', 'guj_combined.csv']


In [5]:
# Names of all the language classes: the first three characters of each file name
languages = list(map(lambda file_name: file_name[:3], language_file_names))
print(languages)

['odi', 'kan', 'mar', 'hin', 'asm', 'mal', 'pun', 'tam', 'tel', 'eng', 'ben', 'guj']


## Loading the Prasar Bharati Training and Test Datasets and the YouTube Test Dataset

In [6]:
def _load_language_frames(data_dir, file_names):
  """Read one CSV per language from `data_dir` into a list of DataFrames.

  Parameters
  ----------
  data_dir : str
      Folder that holds the CSV files.
  file_names : list of str
      File names to read; the returned list follows this order.

  Returns
  -------
  list of pandas.DataFrame
      One frame per file, read without a header row and decoded as UTF-16.
  """
  # Explicit paths instead of os.chdir(): a failed read can no longer leave
  # the notebook stranded inside a data sub-folder.
  return [
    pd.read_csv(os.path.join(data_dir, name), header=None, encoding='UTF-16')
    for name in file_names
  ]

# training dataset
train_df_list = _load_language_frames(pb_train_path, language_file_names)

# test dataset (same file names as the training folder)
pb_test_df_list = _load_language_frames(pb_test_path, language_file_names)

# Yt test dataset (same file names as the training folder)
yt_test_df_list = _load_language_frames(yt_test_path, language_file_names)

# Make sure the working directory is the project root afterwards, in case
# other cells rely on relative paths.
os.chdir(source_path)

---
### <center> Details About the Training Dataset </center>

---

In [7]:
from tabulate import tabulate

In [9]:
# One [language, sample-count] row per training DataFrame
head = ['languages', 'num_samples']
lang_num_samples = [
  [languages[idx], len(frame)]
  for idx, frame in enumerate(train_df_list)
]

# Grand total over all languages
total_samples = sum(row[1] for row in lang_num_samples)

print(tabulate(lang_num_samples, headers=head, tablefmt="grid"))
print('#training samples: ', total_samples)

+-------------+---------------+
| languages   |   num_samples |
| odi         |         88036 |
+-------------+---------------+
| kan         |         84020 |
+-------------+---------------+
| mar         |         78840 |
+-------------+---------------+
| hin         |         85886 |
+-------------+---------------+
| asm         |         84631 |
+-------------+---------------+
| mal         |         78711 |
+-------------+---------------+
| pun         |         93896 |
+-------------+---------------+
| tam         |         80384 |
+-------------+---------------+
| tel         |         93457 |
+-------------+---------------+
| eng         |         85072 |
+-------------+---------------+
| ben         |         92060 |
+-------------+---------------+
| guj         |         91808 |
+-------------+---------------+
#training samples:  1036801


## Preprocessing 

- Let's look at a few examples from the training dataset.
- Consider the language ```odi```.

In [10]:
# Preview the first rows of the first training DataFrame (index 0 pairs with languages[0])
train_df_list[0].head() 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,29,30,31,32,33,34,35,36,37,38
0,-297.55453,114.975655,-26.874882,14.372772,-26.076553,-40.49436,-39.15238,-0.148832,10.557054,4.27341,...,-2.283444,2.129192,1.463246,-0.808631,-0.406028,0.269205,2.194356,-0.669833,1.686981,0.444369
1,-290.50305,119.28168,-27.311127,18.129154,-31.800385,-21.114952,-32.92627,9.195437,10.816742,-6.464131,...,-2.283444,2.129192,1.463246,-0.808631,-0.406028,0.269205,2.194356,-0.669833,1.686981,0.444369
2,-298.6164,124.324005,-26.60351,24.747494,-24.23079,-26.780413,-41.87477,6.689118,5.113431,-0.64437,...,-1.667311,3.237787,2.465588,-0.824509,-0.829416,0.522061,1.11203,0.482328,0.790595,-0.696262
3,-402.34326,171.12654,31.846004,-14.954325,-6.754307,-7.309353,-55.213844,-31.197144,-4.440442,26.303654,...,4.144033,-0.877308,-0.882621,3.369607,2.668791,-3.210168,-3.715754,0.238617,0.100067,0.309535
4,-382.10928,175.3539,25.970688,-30.69917,-15.175529,3.797381,-51.171974,-28.954378,-12.154978,18.99998,...,5.125,1.235222,-0.572539,2.583407,2.551187,-0.468466,-2.867888,-1.104944,-2.074763,1.236288


In [11]:
# Per-feature summary statistics (count, mean, std, min, quartiles, max)
# of the first training DataFrame
train_df_list[0].describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,29,30,31,32,33,34,35,36,37,38
count,88036.0,88036.0,88036.0,88036.0,88036.0,88036.0,88036.0,88036.0,88036.0,88036.0,...,88036.0,88036.0,88036.0,88036.0,88036.0,88036.0,88036.0,88036.0,88036.0,88036.0
mean,-336.802237,110.569523,0.648722,18.126663,-34.445964,-16.827006,-40.340815,-4.296318,-21.604151,-0.662465,...,0.204679,1.160468,0.6253,0.95024,0.130448,0.225098,-0.097968,0.115687,0.030938,0.178585
std,74.011597,36.701967,37.440787,29.241462,24.387689,19.917631,19.626883,19.098677,17.600415,15.227446,...,2.637407,2.475501,2.013369,2.079118,1.818899,1.654506,1.662273,1.456728,1.45058,1.409412
min,-763.4072,-223.29184,-154.05838,-111.729324,-137.28125,-100.64973,-131.8005,-96.42398,-101.25087,-73.910355,...,-13.872405,-13.755872,-8.915551,-7.646359,-7.844583,-8.154106,-7.381017,-6.498169,-8.884719,-10.173363
25%,-389.653262,84.838176,-24.214925,-0.905258,-50.707831,-30.501994,-53.392251,-17.093113,-33.548847,-10.563008,...,-1.517693,-0.49079,-0.713925,-0.440493,-1.069407,-0.858442,-1.202904,-0.835883,-0.917234,-0.713255
50%,-342.06876,110.175127,-0.140745,18.151736,-33.043792,-17.011038,-40.247804,-4.124035,-22.318228,-0.429088,...,0.142639,1.099458,0.611433,0.886138,0.11301,0.236401,-0.118463,0.123209,0.019214,0.186547
75%,-285.552748,136.231288,24.71449,37.990021,-17.376864,-3.420218,-27.387507,8.655362,-10.411426,9.449225,...,1.860272,2.747779,1.950679,2.295154,1.320319,1.322103,0.991329,1.081541,0.9719,1.08726
max,-80.079445,267.5019,182.22916,131.3977,78.83629,107.66381,59.97956,74.45164,59.284264,79.048706,...,14.821094,15.294741,9.728811,10.83554,10.259733,7.764557,9.075257,6.921994,6.695831,7.275021


- ▶ We can see that each feature spans a different range of values and that their standard deviations are also large.
- This might limit our model's performance and its accuracy on the test samples.
- There are two choices for handling this issue:
  1.   ```Min-Max Normalisation```
  2.   ```Z-score Normalisation```

- However, experimentally (in this case), we get higher classification accuracy when the datasets are normalised with ```Min-Max Normalisation```, so we'll go with option (1).





In [12]:
## Per-language, per-feature minimum and maximum of the training samples.
# They are stored so the data can be normalised into the range [0, 1]
# before being fed to the model.
train_min_max_list = [
  [language_df.min(), language_df.max()]
  for language_df in train_df_list
]

def normalise_df(df, minimums, maximums):
  """Min-max normalise `df` feature-wise: (df - minimums) / (maximums - minimums).

  Parameters
  ----------
  df : pandas.DataFrame
      Samples to normalise, one feature per column.
  minimums, maximums : pandas.Series, array-like or scalar
      Per-feature lower and upper reference values.

  Returns
  -------
  pandas.DataFrame
      The normalised samples. Values fall in [0, 1] when `df` lies inside
      the reference range, and may fall outside it otherwise.

  Notes
  -----
  A feature whose maximum equals its minimum has a zero range. Dividing by it
  would give NaN or +/-inf, so such ranges are replaced by 1 and the feature
  is only shifted by its minimum.
  """
  value_range = maximums - minimums
  if isinstance(value_range, (pd.Series, pd.DataFrame)):
    # Keep the pandas object so column alignment with `df` still applies
    safe_range = value_range.mask(value_range == 0, 1)
  else:
    safe_range = np.where(np.asarray(value_range) == 0, 1, value_range)
  normalised_ = (df - minimums) / safe_range
  return normalised_

In [13]:
# Every frame is scaled with the training minimum/maximum stored at the same
# list position (i.e. the same language).
# NOTE(review): the test frames are therefore scaled with the statistics of
# their own true language, which presumes the label is known at test time --
# confirm this is intended.

# Prasar Bharti Training Dataset
normalised_train_df_list = [
  normalise_df(frame, lowest, highest)
  for frame, (lowest, highest) in zip(train_df_list, train_min_max_list)
]

# Prasar Bharti Test Dataset
normalised_pb_test_df_list = [
  normalise_df(frame, lowest, highest)
  for frame, (lowest, highest) in zip(pb_test_df_list, train_min_max_list)
]

# YouTube Test Dataset
normalised_yt_test_df_list = [
  normalise_df(frame, lowest, highest)
  for frame, (lowest, highest) in zip(yt_test_df_list, train_min_max_list)
]

In [14]:
# Let's look at a few examples of our normalised data:
# the first rows of the first normalised training DataFrame
normalised_train_df_list[0].head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,29,30,31,32,33,34,35,36,37,38
0,0.681741,0.689225,0.378199,0.518668,0.514557,0.288773,0.483096,0.563422,0.69647,0.511142,...,0.403888,0.546806,0.556672,0.369969,0.410872,0.529147,0.581868,0.434297,0.678519,0.608522
1,0.692061,0.697999,0.376901,0.534118,0.488072,0.381803,0.515561,0.618107,0.698088,0.440943,...,0.403888,0.546806,0.556672,0.369969,0.410872,0.529147,0.581868,0.434297,0.678519,0.608522
2,0.680187,0.708273,0.379006,0.56134,0.523097,0.354606,0.4689,0.603439,0.662561,0.478991,...,0.425361,0.584967,0.610433,0.36911,0.387486,0.545031,0.516098,0.52015,0.620987,0.543151
3,0.528391,0.803634,0.552814,0.398043,0.603963,0.448076,0.399346,0.381721,0.603048,0.655169,...,0.627893,0.443315,0.43085,0.596041,0.580711,0.310575,0.222727,0.50199,0.576667,0.600795
4,0.558002,0.812247,0.535343,0.333283,0.564997,0.501394,0.420422,0.394846,0.554993,0.60742,...,0.66208,0.516034,0.447482,0.553502,0.574215,0.482807,0.27425,0.401875,0.437081,0.653909


Now every feature lies in the same range, [0, 1] (at least for the training dataset), so differences in feature range should no longer affect our model's performance much.

## 1. GMM 
### Model Training/Building and Saving


> We'll build GMMs with both ```diagonal``` and ```full``` covariance and, at the end, compare their results with respect to the number of clusters.




