# BIG DATA ANALYTICS AND RNN IN BANANA PRICE FORECASTING

## STOP ANY ACTIVE SPARK SESSION

In [None]:
from pyspark.sql import SparkSession

# Stop the Spark session that is already running, if there is one.
# getActiveSession() returns None when no session is active, so no throwaway
# session has to be created just to be shut down again.
running_session = SparkSession.getActiveSession()
if running_session is not None:
    running_session.stop()


## Import all the necessary libraries

In [None]:
import warnings
warnings.filterwarnings("ignore")
from pyspark.ml import Pipeline
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# List the contents of the HDFS root directory (IPython shell escape)
!hdfs dfs -ls /

## Preprocessing Data using Spark
This process involves the following steps:
1. Data loading: load the two CSV files of national average food prices stored in Hadoop (HDFS).

2. Data exploration: perform exploratory data analysis (EDA).

3. Data integration: merge the two CSV datasets.

4. Data export: export the preprocessed data to Hadoop for storage.

## Initialize Spark Session

In [None]:
# Obtain the Spark session used by the rest of the notebook: an existing
# session is reused, otherwise a new one is created.
spark = (
    SparkSession.builder
    .appName("Hadoop to python")
    .getOrCreate()
)

## Step One: Load the data from Hadoop using Spark

In [None]:
# HDFS locations of the two national-average-price CSV files
dataset_one = "hdfs://localhost:9000/CPM04.20240327101259.csv"
dataset_two = "hdfs://localhost:9000/CPM12.20240327101308.csv"

# Read each file into a Spark DataFrame: the first row supplies the column
# names and the column types are inferred from the data.
df_one = spark.read.options(header=True, inferSchema=True).csv(dataset_one)
df_two = spark.read.options(header=True, inferSchema=True).csv(dataset_two)

# Echo both DataFrame objects as the cell output
(df_one, df_two)

## Step Two: Performing Exploratory Data Analysis (EDA)

In [None]:
# Fetch the first two rows of dataset one as a list of Row objects
df_one.take(2)

In [None]:
# Fetch the first two rows of dataset two as a list of Row objects
df_two.take(2)

In [None]:
# Print the schema of dataset one: column names, inferred types and nullability
df_one.printSchema()

## Findings
The schema marks the variables as nullable, i.e. each of them may contain missing values.

In [None]:
# Print the schema of dataset two: column names, inferred types and nullability
df_two.printSchema()

## Findings
The schema marks the variables as nullable, i.e. each of them may contain missing values.

In [None]:
# Check the characteristics (summary statistics) of both datasets.
# describe() only builds a new DataFrame holding the statistics; show() is
# required to actually compute and print them.
df_one.describe().show()
df_two.describe().show()

In [None]:
# Number of rows in dataset one
dataone_count = df_one.count()
print("Number of observations:", dataone_count)

In [None]:
# Number of rows in dataset two
datatwo_count = df_two.count()
print("Number of observations:", datatwo_count)

# Delete variables that will be unused

In [None]:
# Preview dataset one before any columns are removed
df_one.show()

In [None]:
# Remove the columns of dataset one that the later steps do not use
# (those steps work with Month, Consumer Item and VALUE only).
unused_columns_one = ["STATISTIC", "STATISTIC Label", "TLIST(M1)", "UNIT", "C02363V02844"]
df_one = df_one.drop(*unused_columns_one)
df_one.show()

In [None]:
# Preview dataset two before any columns are removed
df_two.show()

In [None]:
# Remove the columns of dataset two that the later steps do not use
# (those steps work with Month, Consumer Item and VALUE only).
unused_columns_two = ["STATISTIC", "STATISTIC Label", "TLIST(M1)", "UNIT", "C02363V03422"]
df_two = df_two.drop(*unused_columns_two)
df_two.show()

## Keep only the banana rows of the Consumer Item column

In [None]:
# Keep only the banana rows of each dataset. The item label is spelled
# differently in the two filters (with and without a trailing full stop).
is_banana_one = df_one["Consumer Item"] == "Bananas per kg."
is_banana_two = df_two["Consumer Item"] == "Bananas per kg"
bananaprice_one = df_one.where(is_banana_one)
bananaprice_two = df_two.where(is_banana_two)

In [None]:
# Preview both filtered datasets. Each show() is its own statement so the
# cell does not additionally output the tuple of their None return values.
bananaprice_one.show()
bananaprice_two.show()

## Step Three: Data Integration. Merging the two datasets into one

In [None]:
# Give the price column of the first banana dataset the name that both
# datasets will share; further old -> new pairs can be added to the mapping.
rename_mapping = {"VALUE": "National Average Price(Euros)"}
for source_name, target_name in rename_mapping.items():
    bananaprice_one = bananaprice_one.withColumnRenamed(source_name, target_name)

In [None]:
# Give the price column of the second banana dataset the same name as the
# price column of the first one.
rename_mapping = {"VALUE": "National Average Price(Euros)"}
for source_name, target_name in rename_mapping.items():
    bananaprice_two = bananaprice_two.withColumnRenamed(source_name, target_name)

In [None]:
# Preview both datasets after the rename. Each show() is its own statement so
# the cell does not additionally output the tuple of their None return values.
bananaprice_one.show()
bananaprice_two.show()

In [None]:
# Bring both datasets to an identical column order, because union()
# matches columns by position rather than by name.
shared_columns = ["Consumer Item", "Month", "National Average Price(Euros)"]
bananaprice_one = bananaprice_one.select(*shared_columns)
bananaprice_two = bananaprice_two.select(*shared_columns)

# Append the rows of the second dataset to those of the first
merged_df = bananaprice_one.union(bananaprice_two)

# Print the leading rows without shortening long cell values
merged_df.show(truncate=False)


In [None]:
# Print every row of the merged data without shortening long cell values
merged_row_total = merged_df.count()
merged_df.show(n=merged_row_total, truncate=False)

In [None]:
# Number of rows in the merged dataset
mergeddata_count = merged_df.count()
print("Number of observations:", mergeddata_count)

In [None]:
# Summary statistics of the merged dataset.
# describe() only builds a new DataFrame holding the statistics; show() is
# required to actually compute and print them.
merged_df.describe().show()

In [None]:
# Print the schema of the merged dataset: column names, types and nullability
merged_df.printSchema()

In [None]:
# Discard every row that has a missing value in any column
cleaned_df = merged_df.dropna()

# Print all remaining rows without shortening long cell values
cleaned_row_total = cleaned_df.count()
cleaned_df.show(n=cleaned_row_total, truncate=False)


In [None]:
# Build a copy holding one row per distinct Month value.
# cleaned_df itself is left unchanged by this cell.
duplicated_df = cleaned_df.drop_duplicates(subset=["Month"])

# The difference in row counts is the number of rows sharing a Month with another row
num_duplicates = cleaned_df.count() - duplicated_df.count()

print("Number of duplicates based on date:", num_duplicates)


## EXPORT THE PREPROCESSED DATA TO HADOOP FOR STORAGE

In [None]:
# Save the cleaned data to HDFS as CSV with a header row, replacing any
# earlier output at the same path.
preprocessed_data_path = "hdfs://localhost:9000/preprocessed_data.csv"
cleaned_df.write.mode("overwrite").option("header", True).csv(preprocessed_data_path)

In [None]:
# List the HDFS root directory again, now that the export has been written
!hdfs dfs -ls /

# Convert the data to pandas

In [None]:
# Collect the cleaned Spark DataFrame into a pandas DataFrame
banana_price = cleaned_df.toPandas()