<a href="https://colab.research.google.com/github/NOBODIDI/APS360_GeoGuessr_NN/blob/main/Creating_Train_Val_Test_Split.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Useful Links

* PyTorch model: https://pytorch.org/vision/stable/models/generated/torchvision.models.googlenet.html#torchvision.models.googlenet
*   https://pytorch.org/hub/pytorch_vision_googlenet/
*   Background info: https://cs231n.github.io/transfer-learning/
*   https://pytorch.org/tutorials/beginner/transfer_learning_tutorial.html
* Finetuning: https://pytorch.org/tutorials/beginner/finetuning_torchvision_models_tutorial.html




In [None]:
import torch
import torchvision
import torchvision.models as models
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from PIL import Image
import os
import matplotlib.pyplot as plt
import shutil
import time
import copy
import os

### Loading in Data

#### Splitting up Data into train-val-test folders (DO NOT RUN)

In [None]:
# Splitting data
# Data Loading and Splitting
# Scan the dataset folder(s) for PNG images and tally how many sit in each class folder.

train_files_list = []
val_files_list = []
test_files_list = []
files_list = []
classes = {}

dataset_dirs = [
                "/content/drive/MyDrive/GeoGuessr Train Val Test"
                ]
for old_path in dataset_dirs:
  for root, dirs, files in os.walk(old_path):
      for file in files:
          # Only PNG images are part of the dataset
          if not file.endswith(".png"):
              continue

          image_path = os.path.join(root, file)
          files_list.append(image_path)

          # The class label is the name of the folder that directly contains the image
          file_class = image_path.split('/')[-2]
          classes[file_class] = classes.get(file_class, 0) + 1

# Report how many .png images were found in total, then show the per-class counts
file_count = len(files_list)
print("We have:",file_count,"images")
classes

We have: 65198 images


{'114': 1035,
 '63': 768,
 '137': 770,
 '41': 765,
 '42': 763,
 '98': 761,
 '99': 769,
 '96': 761,
 '102': 1050,
 '86': 764,
 '169': 721,
 '97': 766,
 '83': 767,
 '61': 758,
 '71': 658,
 '87': 762,
 '66': 742,
 '58': 768,
 '130': 1042,
 '22': 754,
 '134': 1050,
 '120': 966,
 '40': 697,
 '224': 701,
 '152': 767,
 '95': 760,
 '170': 766,
 '206': 765,
 '138': 768,
 '205': 762,
 '115': 1046,
 '117': 766,
 '82': 767,
 '101': 1049,
 '149': 727,
 '76': 755,
 '59': 758,
 '65': 716,
 '153': 764,
 '135': 767,
 '100': 1049,
 '116': 1044,
 '62': 766,
 '173': 760,
 '131': 1042,
 '60': 752,
 '80': 766,
 '136': 769,
 '39': 695,
 '69': 755,
 '132': 760,
 '68': 760,
 '84': 768,
 '57': 765,
 '118': 1047,
 '75': 763,
 '155': 767,
 '93': 760,
 '64': 748,
 '81': 769,
 '187': 701,
 '119': 1050,
 '208': 759,
 '111': 1042,
 '79': 758,
 '151': 763,
 '133': 1047,
 '103': 1050,
 '112': 1040,
 '85': 769,
 '44': 765,
 '188': 759,
 '150': 736,
 '78': 759,
 '23': 766,
 '113': 1022,
 '77': 755,
 '154': 767,
 '43': 76

In [None]:
# Class labels in the order they were first inserted into `classes`
class_list = [label for label in classes]
print(len(class_list))

80


In [None]:
# Draw three disjoint sets of indices (train / val / test) out of `dataset_size` positions.
# NOTE(review): the indices only span 0..dataset_size-1, so when they are used to index a
# longer list only its first `dataset_size` entries can ever be picked — confirm this is intended.
dataset_size = 32000
split = [0.7,0.15,0.15]  # SPLIT RATIO

train_fraction, val_fraction = split[0], split[1]
train_size = int(train_fraction*dataset_size)
val_size = int(val_fraction*dataset_size)
# The test split takes whatever is left so the three sizes always sum to dataset_size
test_size = dataset_size - (train_size + val_size)
print("Train size:",train_size,"\nVal size:",val_size,"\nTest size:",test_size)

# Seed the global NumPy RNG so the same split is drawn on every run
np.random.seed(1000)

indices = list(range(dataset_size))

# Train: sampled without replacement from every index
sampled_train = np.random.choice(indices, train_size, replace=False)
train_indices = list(sampled_train)

# Val: sampled without replacement from the indices train did not take (left as a NumPy array)
other_than_train_indices = list(set(indices) - set(train_indices))
val_indices = np.random.choice(other_than_train_indices, val_size, replace=False)

# Test: everything that is in neither train nor val
test_indices = list(set(other_than_train_indices)-set(val_indices))

Train size: 22400 
Val size: 4800 
Test size: 4800


In [None]:

# Turn the sampled indices into file paths: one list per split, in the same order as destPaths
split_files_list = [
    [files_list[idx] for idx in train_indices],
    [files_list[idx] for idx in val_indices],
    [files_list[idx] for idx in test_indices],
]

destPaths = ['train','val','test']
root_path = "/content/drive/MyDrive/32k_split"

for i, split_files in enumerate(split_files_list):

  # Make sure the folder for this split exists
  destPath = os.path.join(root_path, destPaths[i])
  if not os.path.isdir(destPath):
    os.makedirs(destPath)

  # Move every selected file into <split>/<class>/
  for file in split_files:
    path_parts = file.split('/')
    file_class = path_parts[-2]   # folder that directly contains the image
    file_name = path_parts[-1]

    classPath = os.path.join(destPath, file_class)
    if not os.path.isdir(classPath):
      os.makedirs(classPath)

    # Leave the source untouched when a file of that name is already at the destination
    filePath = os.path.join(classPath, file_name)
    if os.path.isfile(filePath):
      continue
    shutil.move(file, classPath)

In [None]:
# Build the class-balanced dataset: move a fixed number of PNG images per class from the
# full split folders (old_path) into the new ones (new_path). This cell handles split i = 0.
destPaths = ['train','val','test']
limits = [280, 60, 60]  # images to keep per class, in the same order as destPaths
old_path = "/content/drive/MyDrive/GeoGuessr Train Val Test"
new_path = "/content/drive/MyDrive/32k_train_val_test"

i = 0

olddestPath = os.path.join(old_path, destPaths[i])
destPath = os.path.join(new_path, destPaths[i])
# create the destination split folder if it is missing
os.makedirs(destPath, exist_ok=True)

for j in range(len(class_list)):

    oldclassPath = os.path.join(olddestPath, class_list[j])
    classPath = os.path.join(destPath, class_list[j])
    # create the destination class folder if it is missing
    os.makedirs(classPath, exist_ok=True)

    # Start from the images already at the destination, so re-running this cell tops the
    # class up to the limit instead of moving another full batch on top of it.
    k = sum(1 for name in os.listdir(classPath) if name.endswith(".png"))

    # A class with no source folder contributes nothing
    if not os.path.isdir(oldclassPath):
        continue

    # Only the top level of the class folder is considered; sorted for a repeatable selection
    for file in sorted(os.listdir(oldclassPath)):
        if k >= limits[i]:
            break
        oldfile = os.path.join(oldclassPath, file)
        if not file.endswith(".png") or not os.path.isfile(oldfile):
            continue
        # shutil.move raises when the name already exists in the destination folder
        if os.path.exists(os.path.join(classPath, file)):
            continue
        shutil.move(oldfile, classPath)
        k += 1
                


In [None]:
# Move a fixed number of PNG images per class for split i = 1, using the paths, split
# names and per-class limits configured in the i = 0 cell.
i = 1

olddestPath = os.path.join(old_path, destPaths[i])
destPath = os.path.join(new_path, destPaths[i])
# create the destination split folder if it is missing
os.makedirs(destPath, exist_ok=True)

for j in range(len(class_list)):

    oldclassPath = os.path.join(olddestPath, class_list[j])
    classPath = os.path.join(destPath, class_list[j])
    # create the destination class folder if it is missing
    os.makedirs(classPath, exist_ok=True)

    # Start from the images already at the destination, so re-running this cell tops the
    # class up to the limit instead of moving another full batch on top of it.
    k = sum(1 for name in os.listdir(classPath) if name.endswith(".png"))

    # A class with no source folder contributes nothing
    if not os.path.isdir(oldclassPath):
        continue

    # Only the top level of the class folder is considered; sorted for a repeatable selection
    for file in sorted(os.listdir(oldclassPath)):
        if k >= limits[i]:
            break
        oldfile = os.path.join(oldclassPath, file)
        if not file.endswith(".png") or not os.path.isfile(oldfile):
            continue
        # shutil.move raises when the name already exists in the destination folder
        if os.path.exists(os.path.join(classPath, file)):
            continue
        shutil.move(oldfile, classPath)
        k += 1

In [None]:
# Move a fixed number of PNG images per class for split i = 2, using the paths, split
# names and per-class limits configured in the i = 0 cell.
i = 2

olddestPath = os.path.join(old_path, destPaths[i])
destPath = os.path.join(new_path, destPaths[i])
# create the destination split folder if it is missing
os.makedirs(destPath, exist_ok=True)

for j in range(len(class_list)):

    oldclassPath = os.path.join(olddestPath, class_list[j])
    classPath = os.path.join(destPath, class_list[j])
    # create the destination class folder if it is missing
    os.makedirs(classPath, exist_ok=True)

    # Start from the images already at the destination, so re-running this cell tops the
    # class up to the limit instead of moving another full batch on top of it.
    k = sum(1 for name in os.listdir(classPath) if name.endswith(".png"))

    # A class with no source folder contributes nothing
    if not os.path.isdir(oldclassPath):
        continue

    # Only the top level of the class folder is considered; sorted for a repeatable selection
    for file in sorted(os.listdir(oldclassPath)):
        if k >= limits[i]:
            break
        oldfile = os.path.join(oldclassPath, file)
        if not file.endswith(".png") or not os.path.isfile(oldfile):
            continue
        # shutil.move raises when the name already exists in the destination folder
        if os.path.exists(os.path.join(classPath, file)):
            continue
        shutil.move(oldfile, classPath)
        k += 1

In [None]:
# Sanity check: walk the balanced dataset and tally the PNG images per split and per class.
val_list = []
val_classes = {}
tr_list = []
tr_classes = {}
test_list = []
test_classes = {}

# Route each split folder name to the (file list, per-class counter) pair it feeds
split_buckets = {
    'val': (val_list, val_classes),
    'train': (tr_list, tr_classes),
    'test': (test_list, test_classes),
}

dataset_dirs = ["/content/drive/MyDrive/32k_train_val_test"
                ]
# The loop variable is deliberately not named `old_path`: that name holds the source
# folder used by the image-moving cells, and reusing it here would overwrite it.
for dataset_dir in dataset_dirs:
  for root, dirs, files in os.walk(dataset_dir):
      for file in files:

          if not file.endswith(".png"):
              continue

          # The split is the folder one level above the one containing the image;
          # folders that do not sit under train/val/test are ignored.
          file_split = root.split('/')[-2]
          bucket = split_buckets.get(file_split)
          if bucket is None:
              continue

          split_files, split_class_counts = bucket
          image_path = os.path.join(root, file)
          split_files.append(image_path)

          # The class label is the name of the folder that directly contains the image
          file_class = image_path.split('/')[-2]
          split_class_counts[file_class] = split_class_counts.get(file_class, 0) + 1

# Report the number of .png images in each split, followed by the per-class counts

print("We have:",len(tr_list),"train images")
display(tr_classes)
print("We have:",len(val_list),"val images")
display(val_classes)
print("We have:",len(test_list),"test images")
display(test_classes)

We have: 22400 train images


{'114': 280,
 '63': 280,
 '137': 280,
 '41': 280,
 '42': 280,
 '98': 280,
 '99': 280,
 '96': 280,
 '102': 280,
 '86': 280,
 '169': 280,
 '97': 280,
 '83': 280,
 '61': 280,
 '71': 280,
 '87': 280,
 '66': 280,
 '58': 280,
 '130': 280,
 '22': 280,
 '134': 280,
 '120': 280,
 '40': 280,
 '224': 280,
 '152': 280,
 '95': 280,
 '170': 280,
 '206': 280,
 '138': 280,
 '205': 280,
 '115': 280,
 '117': 280,
 '82': 280,
 '101': 280,
 '149': 280,
 '76': 280,
 '59': 280,
 '65': 280,
 '153': 280,
 '135': 280,
 '100': 280,
 '116': 280,
 '62': 280,
 '173': 280,
 '131': 280,
 '60': 280,
 '80': 280,
 '136': 280,
 '39': 280,
 '69': 280,
 '132': 280,
 '68': 280,
 '84': 280,
 '57': 280,
 '118': 280,
 '75': 280,
 '155': 280,
 '93': 280,
 '64': 280,
 '81': 280,
 '187': 280,
 '119': 280,
 '208': 280,
 '111': 280,
 '79': 280,
 '151': 280,
 '133': 280,
 '103': 280,
 '112': 280,
 '85': 280,
 '44': 280,
 '188': 280,
 '150': 280,
 '78': 280,
 '23': 280,
 '113': 280,
 '77': 280,
 '154': 280,
 '43': 280,
 '225': 280}

We have: 4800 val images


{'114': 60,
 '63': 60,
 '137': 60,
 '41': 60,
 '42': 60,
 '98': 60,
 '99': 60,
 '96': 60,
 '102': 60,
 '86': 60,
 '169': 60,
 '97': 60,
 '83': 60,
 '61': 60,
 '71': 60,
 '87': 60,
 '66': 60,
 '58': 60,
 '130': 60,
 '22': 60,
 '134': 60,
 '120': 60,
 '40': 60,
 '224': 60,
 '152': 60,
 '95': 60,
 '170': 60,
 '206': 60,
 '138': 60,
 '205': 60,
 '115': 60,
 '117': 60,
 '82': 60,
 '101': 60,
 '149': 60,
 '76': 60,
 '59': 60,
 '65': 60,
 '153': 60,
 '135': 60,
 '100': 60,
 '116': 60,
 '62': 60,
 '173': 60,
 '131': 60,
 '60': 60,
 '80': 60,
 '136': 60,
 '39': 60,
 '69': 60,
 '132': 60,
 '68': 60,
 '84': 60,
 '57': 60,
 '118': 60,
 '75': 60,
 '155': 60,
 '93': 60,
 '64': 60,
 '81': 60,
 '187': 60,
 '119': 60,
 '208': 60,
 '111': 60,
 '79': 60,
 '151': 60,
 '133': 60,
 '103': 60,
 '112': 60,
 '85': 60,
 '44': 60,
 '188': 60,
 '150': 60,
 '78': 60,
 '23': 60,
 '113': 60,
 '77': 60,
 '154': 60,
 '43': 60,
 '225': 60}

We have: 4800 train images


{'114': 60,
 '63': 60,
 '137': 60,
 '41': 60,
 '42': 60,
 '98': 60,
 '99': 60,
 '96': 60,
 '102': 60,
 '86': 60,
 '169': 60,
 '97': 60,
 '83': 60,
 '61': 60,
 '71': 60,
 '87': 60,
 '66': 60,
 '58': 60,
 '130': 60,
 '22': 60,
 '134': 60,
 '120': 60,
 '40': 60,
 '224': 60,
 '152': 60,
 '95': 60,
 '170': 60,
 '206': 60,
 '138': 60,
 '205': 60,
 '115': 60,
 '117': 60,
 '82': 60,
 '101': 60,
 '149': 60,
 '76': 60,
 '59': 60,
 '65': 60,
 '153': 60,
 '135': 60,
 '100': 60,
 '116': 60,
 '62': 60,
 '173': 60,
 '131': 60,
 '60': 60,
 '80': 60,
 '136': 60,
 '39': 60,
 '69': 60,
 '132': 60,
 '68': 60,
 '84': 60,
 '57': 60,
 '118': 60,
 '75': 60,
 '155': 60,
 '93': 60,
 '64': 60,
 '81': 60,
 '187': 60,
 '119': 60,
 '208': 60,
 '111': 60,
 '79': 60,
 '151': 60,
 '133': 60,
 '103': 60,
 '112': 60,
 '85': 60,
 '44': 60,
 '188': 60,
 '150': 60,
 '78': 60,
 '23': 60,
 '113': 60,
 '77': 60,
 '154': 60,
 '43': 60,
 '225': 60}