# Data Preprocessing

## Loading and Normalizing CSV Data

### Relevant Imports

In [1]:
from pathlib import Path

import pandas as pd

### VIIRS Data

#### Japan

##### Light Intensity Normalization

In [2]:
# Filepaths for the raw extracted data (input) and the normalized copy (output)
input_csv = "datasets/inputs/Japan_light_intensity.csv"
output_normalized_csv = "datasets/inputs/Japan_light_intensity_normalized.csv"

# Loading the extracted data
data = pd.read_csv(input_csv)

# Min-max normalization of the light intensity column: (x - min) / (max - min)
intensity = data['light_intensity']
intensity_min = intensity.min()
intensity_range = intensity.max() - intensity_min

# A constant column has a zero range, and 0 / 0 would silently turn every row
# into NaN. Dividing by 1 instead maps such a column to 0.0 while leaving
# genuinely missing values as NaN.
intensity_scale = intensity_range if intensity_range != 0 else 1
data['normalized_light_intensity'] = (intensity - intensity_min) / intensity_scale

# Save the normalized data to a new CSV (the original column is kept alongside)
data.to_csv(output_normalized_csv, index=False)
print(f"Normalized VIIRS data saved to {output_normalized_csv}")

Normalized VIIRS data saved to datasets/inputs/Japan_light_intensity_normalized.csv


##### High-Radiance Zones Extraction

In [3]:
# Filepaths for the normalized input and the extracted high-radiance output
normalized_csv = "datasets/inputs/Japan_light_intensity_normalized.csv"
urban_output_csv = "datasets/processed/Japan_urban_light_intensity.csv"

# Loading the normalized data; the raw intensity column is dropped and the
# remaining columns are kept as they are
normalized_data = pd.read_csv(normalized_csv).drop(columns='light_intensity')

# Minimum normalized intensity (inclusive) for a row to count as high-radiance
threshold = 0.3

# Extracting high-radiance zones
is_high_radiance = normalized_data['normalized_light_intensity'] >= threshold
urban_data = normalized_data.loc[is_high_radiance]

# to_csv does not create missing directories, so make sure the output folder
# exists before writing (this is the first write into it)
Path(urban_output_csv).parent.mkdir(parents=True, exist_ok=True)

# Saving the extracted urban areas data
urban_data.to_csv(urban_output_csv, index=False)
print(f"Urban areas data saved to {urban_output_csv}")

Urban areas data saved to datasets/processed/Japan_urban_light_intensity.csv


In [4]:
# Preview the first rows of urban_data (the rows kept by the threshold filter in the previous cell)
urban_data.head()

Unnamed: 0,longitude,latitude,normalized_light_intensity
4484274,141.350003,43.054166,0.353907
4484275,141.354169,43.054166,0.46118
4484276,141.358336,43.054166,0.354317
8986526,141.500003,40.5375,0.523353
8986527,141.504169,40.5375,0.654899


In [5]:
# Summary statistics for urban_data; 'count' shows how many rows passed the threshold filter
urban_data.describe()

Unnamed: 0,longitude,latitude,normalized_light_intensity
count,30.0,30.0,30.0
mean,138.226253,36.769305,0.425668
std,2.961285,3.003967,0.149639
min,132.537503,34.220833,0.300467
25%,135.501044,34.670833,0.338119
50%,139.700003,35.677083,0.375986
75%,140.940628,39.333333,0.450831
max,141.508336,43.054166,1.0


#### Philippines

##### Light Intensity Normalization

In [6]:
# Filepaths for the raw extracted data (input) and the normalized copy (output)
input_csv = "datasets/inputs/Philippines_light_intensity.csv"
output_normalized_csv = "datasets/inputs/Philippines_light_intensity_normalized.csv"

# Loading the extracted data
data = pd.read_csv(input_csv)

# Min-max normalization of the light intensity column: (x - min) / (max - min)
intensity = data['light_intensity']
intensity_min = intensity.min()
intensity_range = intensity.max() - intensity_min

# A constant column has a zero range, and 0 / 0 would silently turn every row
# into NaN. Dividing by 1 instead maps such a column to 0.0 while leaving
# genuinely missing values as NaN.
intensity_scale = intensity_range if intensity_range != 0 else 1
data['normalized_light_intensity'] = (intensity - intensity_min) / intensity_scale

# Save the normalized data to a new CSV (the original column is kept alongside)
data.to_csv(output_normalized_csv, index=False)
print(f"Normalized VIIRS data saved to {output_normalized_csv}")

Normalized VIIRS data saved to datasets/inputs/Philippines_light_intensity_normalized.csv


##### High-Radiance Zones Extraction

In [7]:
# Filepaths for the normalized input and the extracted high-radiance output
normalized_csv = "datasets/inputs/Philippines_light_intensity_normalized.csv"
urban_output_csv = "datasets/processed/Philippines_urban_light_intensity.csv"

# Loading the normalized data; the raw intensity column is dropped and the
# remaining columns are kept as they are
normalized_data = pd.read_csv(normalized_csv).drop(columns='light_intensity')

# Minimum normalized intensity (inclusive) for a row to count as high-radiance
threshold = 0.3

# Extracting high-radiance zones
is_high_radiance = normalized_data['normalized_light_intensity'] >= threshold
urban_data = normalized_data.loc[is_high_radiance]

# to_csv does not create missing directories, so make sure the output folder
# exists before writing
Path(urban_output_csv).parent.mkdir(parents=True, exist_ok=True)

# Saving the extracted urban areas data
urban_data.to_csv(urban_output_csv, index=False)
print(f"Urban areas data saved to {urban_output_csv}")

Urban areas data saved to datasets/processed/Philippines_urban_light_intensity.csv


In [8]:
# Preview the first rows of urban_data (the rows kept by the threshold filter in the previous cell)
urban_data.head()

Unnamed: 0,longitude,latitude,normalized_light_intensity
3549238,120.100002,16.125,0.361303
4522957,120.966669,14.754166,0.3546
4522958,120.970836,14.754166,0.367698
4617666,121.054169,14.620833,0.307261
4620602,120.958336,14.616666,0.513001


In [9]:
# Summary statistics for urban_data; 'count' shows how many rows passed the threshold filter
urban_data.describe()

Unnamed: 0,longitude,latitude,normalized_light_intensity
count,152.0,152.0,152.0
mean,121.17248,14.2341,0.411548
std,0.719162,1.154185,0.121967
min,120.100002,6.904166,0.300249
25%,120.983336,14.516666,0.330265
50%,121.004169,14.541666,0.366314
75%,121.050002,14.580208,0.456612
max,124.662502,16.125,1.0


#### Taiwan

##### Light Intensity Normalization

In [10]:
# Filepaths for the raw extracted data (input) and the normalized copy (output)
input_csv = "datasets/inputs/Taiwan_light_intensity.csv"
output_normalized_csv = "datasets/inputs/Taiwan_light_intensity_normalized.csv"

# Loading the extracted data
data = pd.read_csv(input_csv)

# Min-max normalization of the light intensity column: (x - min) / (max - min)
intensity = data['light_intensity']
intensity_min = intensity.min()
intensity_range = intensity.max() - intensity_min

# A constant column has a zero range, and 0 / 0 would silently turn every row
# into NaN. Dividing by 1 instead maps such a column to 0.0 while leaving
# genuinely missing values as NaN.
intensity_scale = intensity_range if intensity_range != 0 else 1
data['normalized_light_intensity'] = (intensity - intensity_min) / intensity_scale

# Save the normalized data to a new CSV (the original column is kept alongside)
data.to_csv(output_normalized_csv, index=False)
print(f"Normalized VIIRS data saved to {output_normalized_csv}")

Normalized VIIRS data saved to datasets/inputs/Taiwan_light_intensity_normalized.csv


##### High-Radiance Zones Extraction

In [11]:
# Filepaths for the normalized input and the extracted high-radiance output
normalized_csv = "datasets/inputs/Taiwan_light_intensity_normalized.csv"
urban_output_csv = "datasets/processed/Taiwan_urban_light_intensity.csv"

# Loading the normalized data; the raw intensity column is dropped and the
# remaining columns are kept as they are
normalized_data = pd.read_csv(normalized_csv).drop(columns='light_intensity')

# Minimum normalized intensity (inclusive) for a row to count as high-radiance
threshold = 0.3

# Extracting high-radiance zones
is_high_radiance = normalized_data['normalized_light_intensity'] >= threshold
urban_data = normalized_data.loc[is_high_radiance]

# to_csv does not create missing directories, so make sure the output folder
# exists before writing
Path(urban_output_csv).parent.mkdir(parents=True, exist_ok=True)

# Saving the extracted urban areas data
urban_data.to_csv(urban_output_csv, index=False)
print(f"Urban areas data saved to {urban_output_csv}")

Urban areas data saved to datasets/processed/Taiwan_urban_light_intensity.csv


In [12]:
# Preview the first rows of urban_data (the rows kept by the threshold filter in the previous cell)
urban_data.head()

Unnamed: 0,longitude,latitude,normalized_light_intensity
245570,121.750002,25.15,0.324514
256488,121.241669,25.1,0.326211
257407,121.237502,25.095833,0.386666
257408,121.241669,25.095833,0.447414
257409,121.245836,25.095833,0.320626


In [13]:
# Summary statistics for urban_data; 'count' shows how many rows passed the threshold filter
urban_data.describe()

Unnamed: 0,longitude,latitude,normalized_light_intensity
count,285.0,285.0,285.0
mean,120.575105,23.443318,0.36117
std,0.483176,0.974436,0.086443
min,120.179169,22.491666,0.300001
25%,120.225002,22.6375,0.315573
50%,120.329169,22.995833,0.335162
75%,120.937502,24.145833,0.370983
max,121.750002,25.15,1.0


#### Combining the Per-Country Urban Data

In [14]:
# Per-country high-radiance CSVs to merge
urban_csvs = [
    "datasets/processed/Japan_urban_light_intensity.csv",
    "datasets/processed/Philippines_urban_light_intensity.csv",
    "datasets/processed/Taiwan_urban_light_intensity.csv"
]

# Destination of the merged table
combined_csv = "datasets/processed/combined_urban_light_intensity.csv"

# One tagged DataFrame per country is collected here
urban_dfs = []

for csv_path in urban_csvs:
    # The country tag is the part of the file name before the first underscore
    file_name = csv_path.rsplit("/", 1)[-1]
    country_name = file_name.partition("_")[0]

    # Read the country's rows and attach the tag as a 'country' column
    df = pd.read_csv(csv_path).assign(country=country_name)
    urban_dfs.append(df)

# Stack the per-country frames under a fresh 0..n-1 index
combined_df = pd.concat(urban_dfs, ignore_index=True)

# Write the merged table to disk
combined_df.to_csv(combined_csv, index=False)
print(f"Combined urban light intensity data saved to {combined_csv}")

Combined urban light intensity data saved to datasets/processed/combined_urban_light_intensity.csv
