# In [None]:
# Importing all necessary packages
import pandas as pd
from sklearn.model_selection import train_test_split,cross_val_score
import requests, shutil
import os
from skimage import io
import numpy as np
from skimage.transform import resize
from PIL import Image
import cv2
from skimage import img_as_ubyte
import time
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from sklearn.externals import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
# Target size passed to skimage's resize() for every downloaded image.
new_size = (256, 256)
# Number of orientation bins used for each cell histogram in hog().
bin_n = 16
#Defining Functions
#Download Data & Preprocessing Functions:
def download_prep(im_info, loc, timeout=30):
    """Download one image, resize it to ``new_size`` and save it as a JPEG.

    The raw download is written to ``./Resized_image1/<id>.jpg``, read back,
    resized, saved to ``./Resized_image/<loc>/<id>.jpg`` and the raw copy is
    then removed.  Handling is best-effort: on any failure the image id is
    printed and the function returns normally so that a bulk download loop
    keeps going.  A raw file that was written before the failure is left in
    ``./Resized_image1`` for inspection.

    Parameters
    ----------
    im_info : pandas.Series
        Row whose position 0 is used as the image id (file name) and whose
        position 1 is used as the download URL.
    loc : str
        Sub-folder of ``./Resized_image`` to save the resized image into.
    timeout : float, optional
        Seconds to wait on the server before giving up, so a single stalled
        connection cannot block the whole run.
    """
    image_id = str(im_info.iloc[0])
    raw_path = './Resized_image1/' + image_id + '.jpg'
    out_path = './Resized_image/' + str(loc) + '/' + image_id + '.jpg'
    try:
        response = requests.get(im_info.iloc[1], stream=True, timeout=timeout)
        # Context manager guarantees the handle is flushed and closed before
        # the file is read back below.
        with open(raw_path, 'wb') as raw_file:
            raw_file.write(response.content)
        img1 = io.imread(raw_path)
        resized = resize(img1, new_size, mode='reflect',
                         anti_aliasing=True, anti_aliasing_sigma=None)
        io.imsave(out_path, img_as_ubyte(np.array(resized)))
        os.remove(raw_path)
    except Exception:
        # Deliberately broad (network, decode and file errors are all
        # expected here) but no longer a bare ``except``, so Ctrl-C still
        # stops a long download loop.
        print(im_info.iloc[0])  # id of the image that failed

#Feature Extractor - HOG Extractor
def hog(img):
    """Return a magnitude-weighted gradient-orientation histogram of *img*.

    Sobel gradients are taken along x and y and converted to polar form.
    The angles are quantized into ``bin_n`` bins, the image is cut into four
    cells at row/column 10, and a ``bin_n``-entry histogram of orientations
    weighted by gradient magnitude is computed per cell.

    Parameters
    ----------
    img : numpy.ndarray
        Image array as accepted by ``cv2.Sobel``.

    Returns
    -------
    numpy.ndarray
        The four cell histograms concatenated: ``4 * bin_n`` values
        (64 with ``bin_n = 16``).
    """
    gx = cv2.Sobel(img, cv2.CV_32F, 1, 0)
    gy = cv2.Sobel(img, cv2.CV_32F, 0, 1)
    mag, ang = cv2.cartToPolar(gx, gy)
    # Quantize the angles (radians) into bin indices 0 .. bin_n - 1.  Should
    # an angle come back as a full turn it would quantize to ``bin_n``; that
    # makes np.bincount return bin_n + 1 entries and the feature vector would
    # no longer have a fixed length.  The modulo wraps such a value onto
    # bin 0 and leaves every in-range index untouched.
    bins = np.int32(bin_n * ang / (2 * np.pi)) % bin_n
    # NOTE(review): the split point 10 is hard-coded, so on 256x256 inputs
    # the four cells are very unequal in size - confirm this is intended.
    cell_slices = (
        (slice(None, 10), slice(None, 10)),
        (slice(10, None), slice(None, 10)),
        (slice(None, 10), slice(10, None)),
        (slice(10, None), slice(10, None)),
    )
    hists = [
        np.bincount(bins[rows, cols].ravel(), mag[rows, cols].ravel(), bin_n)
        for rows, cols in cell_slices
    ]
    return np.hstack(hists)

#Load data
train = pd.read_csv("./train.csv")
#Frequency count of each landmark, most frequent first
landmark_counts = train["landmark_id"].value_counts()
# Build the table from explicit columns: the column name that value_counts()
# hands to a DataFrame is not the same in every pandas release, so renaming
# it afterwards does not reliably produce a "Frequency" column.
val = pd.DataFrame({
    "Frequency": landmark_counts.values,
    "Landmark_id": landmark_counts.index,
})
#Fetching top 10 sampled landmark details
top_10_landmark_id = val["Landmark_id"].head(10).tolist()
top_df = train[train["landmark_id"].isin(top_10_landmark_id)].reset_index(drop=True)
#Split the dataset into Train & Test - 70% / 30%
xTrain, xTest = train_test_split(top_df, test_size=0.3, random_state=0)
#Download Data:
# Reference point for the elapsed-seconds figure printed every 100 images.
# (It was previously never defined, so the first iteration raised NameError.)
start_time = time.time()
# Train images go to ./Resized_image/Train_image, test images to
# ./Resized_image/Test_image; both are handled by the same loop.
for loc, frame in (("Train_image", xTrain), ("Test_image", xTest)):
    for i in range(len(frame)):
        if i % 100 == 0:
            print(i, (time.time() - start_time))
        im_info = frame.iloc[i]
        download_prep(im_info, loc)
#Feature Extraction for Train and test dataset
def _extract_features(frame, folder, tag):
    """Compute HOG features for every image of *frame* found on disk.

    Parameters
    ----------
    frame : pandas.DataFrame
        Rows whose position 0 is used as the image id (file name) and whose
        position 2 is used as the label.
    folder : str
        Sub-folder of ``./Resized_image`` holding the resized images.
    tag : str
        Prefix printed in front of the id of any image that fails.

    Returns
    -------
    tuple of (list, list)
        Feature vectors and their labels, in matching order.  Images whose
        file is missing, or whose processing fails, are skipped.
    """
    features, labels = [], []
    for i in range(len(frame)):
        row = frame.iloc[i]
        path = './Resized_image/' + folder + '/' + str(row.iloc[0]) + '.jpg'
        try:
            if os.path.exists(path):
                his = hog(io.imread(path))
                features.append(his)
                labels.append(row.iloc[2])
        except Exception as err:
            # Best-effort: report the image and carry on.  The reason is
            # printed too so real failures are not silently hidden.
            print(tag, row.iloc[0], err)
    return features, labels


# The accumulator lists were previously never created, so every append raised
# a NameError that the bare ``except`` swallowed.
#Xtrain:
train_feature, train_labels = _extract_features(xTrain, 'Train_image', "Train ")
#XTest
test_feature, test_labels = _extract_features(xTest, 'Test_image', "Test ")
#Converting to an Numpy Array
#Train image
train_feature_data = np.float32(train_feature)
train_label_data = np.float32(train_labels)
#Test image
test_feature_data = np.float32(test_feature)
test_label_data = np.float32(test_labels)
#Modelling
def _report_results(title, y_true, y_pred):
    """Print accuracy and a labelled confusion matrix for one classifier.

    Parameters
    ----------
    title : str
        Heading printed before the figures.
    y_true, y_pred : numpy.ndarray
        True and predicted labels of the test set.

    Returns
    -------
    tuple
        ``(accuracy in percent, confusion matrix, confusion matrix as a
        DataFrame indexed by class label)``.
    """
    acc = accuracy_score(y_true, y_pred) * 100
    # Use one numerically sorted label list both to build the matrix and to
    # name its rows/columns.  Names obtained by sorting the labels as strings
    # follow lexicographic order and can end up on the wrong rows.
    labels = np.unique(np.concatenate((y_true, y_pred)))
    conf_matrix = confusion_matrix(y_true, y_pred, labels=labels)
    df_cm = pd.DataFrame(conf_matrix, index=labels, columns=labels)
    print(title)
    print("Accuracy")
    print(acc)
    print("Confusion Matrix")
    print(df_cm)
    return acc, conf_matrix, df_cm


seed = 100
n_trees = 100
models = [
    ('LR', LogisticRegression(random_state=seed, solver='lbfgs', multi_class='multinomial', max_iter=1000)),
    ('KNN', KNeighborsClassifier()),
    ('Decision Tree', DecisionTreeClassifier(random_state=seed)),
    ('RF', RandomForestClassifier(n_estimators=n_trees, random_state=seed)),
    ('NB', GaussianNB()),
    ('SVM Non Linear', SVC(random_state=seed, kernel='rbf', max_iter=1000, C=0.1, gamma=0.0001)),
    ('SVM Linear', SVC(kernel='linear', max_iter=1000)),
]
#Cross Validation
scoring = "accuracy"
# Per-model fold scores.  This list used to be bound to the name
# ``accuracy_score``, which hid the metric function needed further down.
results = []
classifier = []
# random_state only has an effect together with shuffle=True; the integer
# seed makes every model see the same ten folds.
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
for classfr, model in models:
    cv_value = cross_val_score(model, train_feature_data, train_label_data, cv=kfold, scoring=scoring)
    results.append(cv_value)
    classifier.append(classfr)
    Score = "%s: %f" % (classfr, cv_value.mean())
    print(Score)
#Prediction on Test Set:
#Logistic Regression
mlg = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial', max_iter=1000)
lg_clf = mlg.fit(train_feature_data, train_label_data)
pred_lg = lg_clf.predict(test_feature_data)
acc, conf_matrix, df_cm = _report_results("Logistic Regression results", test_label_data, pred_lg)
#SVM_Linear Modeling
svm_clf = SVC(kernel='linear', max_iter=1000)
svm_clf.fit(train_feature_data, train_label_data)
pred_svml = svm_clf.predict(test_feature_data)
acc, conf_matrix, df_cm = _report_results("SVM Linear Kernel results", test_label_data, pred_svml)
#SVM_Non_Linear Modeling
svm_clf = SVC(kernel='rbf', max_iter=1000, C=0.1, gamma=0.0001)
svm_clf.fit(train_feature_data, train_label_data)
pred_svmnl = svm_clf.predict(test_feature_data)
acc, conf_matrix, df_cm = _report_results("SVM Non Linear Kernel results", test_label_data, pred_svmnl)
#Random Forest Classfier
model_rf = RandomForestClassifier(n_estimators=500)
model_rf.fit(train_feature_data, train_label_data)
pred_rf = model_rf.predict(test_feature_data)
acc, conf_matrix, df_cm = _report_results("Random Forest results", test_label_data, pred_rf)
#Decision Tree Classifier
model_DT = DecisionTreeClassifier(random_state=seed)
model_DT.fit(train_feature_data, train_label_data)
pred_DT = model_DT.predict(test_feature_data)
acc, conf_matrix, df_cm = _report_results("Decision Tree results", test_label_data, pred_DT)
# These two names were module-level in the original script; they are still
# defined here with the values the last section gave them.
test_label_data_val = test_label_data.astype("str")
class_names = np.unique(test_label_data)











