In [3]:
import pandas as pd
import numpy as np
import zipfile
import gdown
import os

# Descarga

In [2]:
data_url = 'https://drive.google.com/uc?id=1qB5dPWZMi2-12sLHDykHb9i6GibbJ46l&confirm=t&uuid=7b142ec9-01d0-4f33-b79f-a1088a266174'
data_dir = '../TFM'
zip_file = os.path.join('../TFM','SolarPanelSoilingImageDataset.zip')

if not os.path.isfile('SolarPanelSoilingImageDataset.zip'):
    gdown.download(data_url, quiet=False) 

# Descompresión

In [3]:
# Descomprimir el archivo
with zipfile.ZipFile(zip_file, 'r') as zip_object:
    zip_object.extractall(path=data_dir)

# Generación y comprobación del dataframe con las etiquetas de las imagenes

In [4]:
# Ruta al directorio que contiene las imágenes
data_dir = '../TFM/Solar_Panel_Soiling_Image_dataset/PanelImages'

# Obtén la lista de archivos en el directorio
file_list = os.listdir(data_dir)

# Crear un DataFrame para almacenar información del nombre del archivo
df = pd.DataFrame(file_list, columns=['filename'])

# Visualizar las primeras filas del DataFrame
print(df.head())

# Visualizar información estadística sobre el DataFrame
print(df.describe())

                                            filename
0  solar_Fri_Jun_16_10__0__11_2017_L_0.9061532083...
1  solar_Fri_Jun_16_10__0__16_2017_L_0.9030816970...
2  solar_Fri_Jun_16_10__0__1_2017_L_0.91669804403...
3  solar_Fri_Jun_16_10__0__21_2017_L_0.9030816970...
4  solar_Fri_Jun_16_10__0__26_2017_L_0.8960873911...
                                                 filename
count                                               45754
unique                                              45754
top     solar_Fri_Jun_16_10__0__11_2017_L_0.9061532083...
freq                                                    1


# Procesado de los datos de las etiquetas

In [7]:
# Comprobar formato del label
print(df['filename'][0])

solar_Fri_Jun_16_10__0__11_2017_L_0.906153208302_I_0.321592156863.jpg


In [8]:
# Dividir label por "_" y comprobar
df['filename'][0].split("_")

['solar',
 'Fri',
 'Jun',
 '16',
 '10',
 '',
 '0',
 '',
 '11',
 '2017',
 'L',
 '0.906153208302',
 'I',
 '0.321592156863.jpg']

In [21]:
# Guardar datos
solar = df['filename'].apply(lambda x: x.split("_")[0])
day_of_week = df['filename'].apply(lambda x: x.split("_")[1])
month = df['filename'].apply(lambda x: x.split("_")[2])
day = df['filename'].apply(lambda x: x.split("_")[3])
hour = df['filename'].apply(lambda x: x.split("_")[4])
time_log = df['filename'].apply(lambda x: x.split("_")[4] + ":" +
                            x.split("_")[6] + ":" + x.split("_")[8])
minutes = df['filename'].apply(lambda x: x.split("_")[6])
seconds = df['filename'].apply(lambda x: x.split("_")[8])
year = df['filename'].apply(lambda x: x.split("_")[9])
loss = df['filename'].apply(lambda x: x.split("_")[11])
irradiation = df['filename'].apply(lambda x: x.split("_")[13].replace(".jpg", ""))

# Añadir las listas al dataframe
df['solar']       = solar
df['day_of_week'] = day_of_week
df['month']       = month
df['day']         = day
df['hour']        = hour
df['min']         = minutes
df['sec']         = seconds
df['time']        = df['hour'].astype('int')*3600 + df['min'].astype('int')*60 + df['sec'].astype('int')
df['year']        = year
df['date']        = year + "-" + month + "-" + day + " " + time_log
df['loss']        = loss
df['irradiation'] = irradiation

In [25]:
df = df.sort_values(by='date')

In [26]:
df.head()

Unnamed: 0,filename,solar,day_of_week,month,day,hour,min,sec,time,year,date,loss,irradiation
25126,solar_Tue_Jun_13_10__0__12_2017_L_0.1507811777...,solar,Tue,Jun,13,10,0,12,36012,2017,2017-Jun-13 10:0:12,0.150781177776,0.169678431373
25127,solar_Tue_Jun_13_10__0__18_2017_L_0.1507811777...,solar,Tue,Jun,13,10,0,18,36018,2017,2017-Jun-13 10:0:18,0.150781177776,0.169678431373
25130,solar_Tue_Jun_13_10__0__2_2017_L_0.14720963921...,solar,Tue,Jun,13,10,0,2,36002,2017,2017-Jun-13 10:0:2,0.147209639219,0.170545098039
25128,solar_Tue_Jun_13_10__0__23_2017_L_0.1544552142...,solar,Tue,Jun,13,10,0,23,36023,2017,2017-Jun-13 10:0:23,0.154455214294,0.167901960784
25129,solar_Tue_Jun_13_10__0__28_2017_L_0.1544552142...,solar,Tue,Jun,13,10,0,28,36028,2017,2017-Jun-13 10:0:28,0.154455214294,0.167901960784


# Conversión en csv del dataframe

In [27]:
df.to_csv('image_labels.csv', encoding='utf-8', index=False)