In [52]:
# Required packages and libraries
import pyspark
import pyspark.pandas as ps
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit
from pyspark.sql.types import *
import warnings

#filter out warnings
warnings.filterwarnings("ignore")

In [37]:
# Initalize spark session
spark = SparkSession.builder \
    .appName("F-A-O") \
    .config("spark.driver.maxResultSize", "6g") \
    .config("spark.executor.memory", "10g") \
    .getOrCreate()


In [38]:
# Set the maxToStringFields property
spark.conf.set("spark.sql.debug.maxToStringFields", "100")

In [39]:
# Get the dataset
prod_data = spark\
            .read\
            .options(inferSchema="true", header="true")\
            .csv("Production_Crops_Livestock_E_Africa.csv")

There are columns that do not add direct context to my analysis. I will drop them.

In [40]:
# Drop columns

# drop flag cols
to_drop = [col for col in prod_data.columns if col.endswith("F")]

# drop unnecessary cols
to_drop1 = ['Area Code','Area Code (M49)','Item Code','Item Code (CPC)','Element Code','Unit']

# implement
prod_data = (prod_data\
            .drop(*to_drop, *to_drop1))

In [41]:
# Rename columns
#rename year cols
renamed_cols = [col.replace('Y', '') for col in prod_data.columns]
prod_data = prod_data.toDF(*renamed_cols)

#rename other cols
prod_data = prod_data\
                .withColumnRenamed("Area", "Country")


In [42]:
# Trim column names
prod_data = prod_data.select([col(name).alias(name.strip()) for name in prod_data.columns])
prod_data.printSchema()

root
 |-- Country: string (nullable = true)
 |-- Item: string (nullable = true)
 |-- Element: string (nullable = true)
 |-- 1961: double (nullable = true)
 |-- 1962: double (nullable = true)
 |-- 1963: double (nullable = true)
 |-- 1964: double (nullable = true)
 |-- 1965: double (nullable = true)
 |-- 1966: double (nullable = true)
 |-- 1967: double (nullable = true)
 |-- 1968: double (nullable = true)
 |-- 1969: double (nullable = true)
 |-- 1970: double (nullable = true)
 |-- 1971: double (nullable = true)
 |-- 1972: double (nullable = true)
 |-- 1973: double (nullable = true)
 |-- 1974: double (nullable = true)
 |-- 1975: double (nullable = true)
 |-- 1976: double (nullable = true)
 |-- 1977: double (nullable = true)
 |-- 1978: double (nullable = true)
 |-- 1979: double (nullable = true)
 |-- 1980: double (nullable = true)
 |-- 1981: double (nullable = true)
 |-- 1982: double (nullable = true)
 |-- 1983: double (nullable = true)
 |-- 1984: double (nullable = true)
 |-- 1985: double

In [43]:
# Melt dataframe
#convert spark to pandas df
prod_data = ps.DataFrame(prod_data)

In [44]:
#melt the pandas df
keep_columns=['Country', 'Item', 'Element']
prod_data=prod_data.melt(id_vars=keep_columns, var_name='Year',value_name='Weight')
prod_data.head()

Unnamed: 0,Country,Item,Element,Year,Weight
0,Algeria,"Almonds, in shell",Area harvested,1961,13300.0
1,Algeria,"Almonds, in shell",Area harvested,1962,13300.0
2,Algeria,"Almonds, in shell",Area harvested,1963,13300.0
3,Algeria,"Almonds, in shell",Area harvested,1964,14200.0
4,Algeria,"Almonds, in shell",Area harvested,1965,13800.0


I converted the dataframe to pandas dataframe in order to melt it. I could not get along with unpivoting the dataframe with pyspark. If you know how, kindly reach out.
Now I'll have to covert the dataframe back to spark DF for further processing.

In [45]:
# Convert pandas dataframe to RDD 
rdd = spark.sparkContext.parallelize(prod_data.values.tolist())

#Schema for the pyspark dataFrame
schema = StructType([
    StructField("Country", StringType(), nullable=False),
    StructField("Item", StringType(), nullable=False),
    StructField("Element", StringType(), nullable=False),
    StructField("Year", StringType(), nullable=False),
    StructField("Weight", DoubleType(), nullable=False)
])

#create pyspark dataFrame
prod_data = spark.createDataFrame(rdd, schema=schema)

In [53]:
prod_data = prod_data.withColumn("Category",lit(""))

# Show the updated DataFrame
prod_data.show()

+-------+-----------------+--------------+----+-------+--------+
|Country|             Item|       Element|Year| Weight|Category|
+-------+-----------------+--------------+----+-------+--------+
|Algeria|Almonds, in shell|Area harvested|1961|13300.0|        |
|Algeria|Almonds, in shell|Area harvested|1962|13300.0|        |
|Algeria|Almonds, in shell|Area harvested|1963|13300.0|        |
|Algeria|Almonds, in shell|Area harvested|1964|14200.0|        |
|Algeria|Almonds, in shell|Area harvested|1965|13800.0|        |
|Algeria|Almonds, in shell|Area harvested|1966|12700.0|        |
|Algeria|Almonds, in shell|Area harvested|1967|12900.0|        |
|Algeria|Almonds, in shell|Area harvested|1968|10000.0|        |
|Algeria|Almonds, in shell|Area harvested|1969| 5500.0|        |
|Algeria|Almonds, in shell|Area harvested|1970| 6700.0|        |
|Algeria|Almonds, in shell|Area harvested|1971| 4600.0|        |
|Algeria|Almonds, in shell|Area harvested|1972| 8300.0|        |
|Algeria|Almonds, in shel

23/06/04 16:29:28 WARN TaskSetManager: Stage 39 contains a task of very large size (3721 KiB). The maximum recommended task size is 1000 KiB.


In [None]:
# Meditation HQ
#======================================
# .withColumnRenamed("year", trim(col("year")).alias("year"))
# keep_columns = ['Country', 'Item', 'Element']
# other_columns = [col(column) for column in prod_data.columns if column not in keep_columns]
# other_df = prod_data.select(*other_columns)
# other_df.printSchema()

#===============================
# # Assuming you have a DataFrame named 'df' with the year columns and other columns
# keep_columns = ['Area', 'Item', 'Element']
# year_columns = [col for col in prod_data.columns if col not in keep_columns]

#==========================
# # Melt the year columns
# prod_data = prod_data.select(keep_columns + [
#     explode(
#         [
#             (lit(col).alias('year'), col)
#             for col in year_columns
#         ]
#     ).alias('melted')
# ]).select(keep_columns + [
#     col('melted.year'),
#     col('melted.weight')
# ])
# prod_data.show()

#=========================
# melted_df = renamed_df.selectExpr("stack(61, " + ", ".join([f"'{col}', {col}" for col in renamed_df.columns]) + ") as (year, weight)")

#====================
# from pyspark.sql.functions import expr
# melted_df = other_df.selectExpr("posexplode(array(*)) as (year, weight)").select("year", "weight")
# melted_df.show()

#===================
# cols = other_df.columns
# other_df = other_df.selectExpr("stack({},{})".format(len(cols), ','.join(("'{}'".format(i) for i in cols))))
# other_df.show()

#===================
# # Configure output partitions
# spark.conf.set("spark.sql.shuffle.partitions", "8")

#=================
# #partitionSizes = prod_data.rdd.glom().map(len).collect()
# print("Size of partitions:", partitionSizes)

#==============
# numPartitions = prod_data.rdd.getNumPartitions()
# print("Number of partitions:", numPartitions)

#===================
# repartitionedDF = prod_data.repartition(32)

#==============
# from pyspark import SparkContext
# sc = SparkContext.getOrCreate()
# executor_memory = sc.getConf().get("spark.executor.memory")
# print("Executor Memory:", executor_memory)

#=================
#prod_data.explain("formatted")

#=========================
# Set the maxPartitionBytes property
# spark.conf.set("spark.sql.files.maxPartitionBytes", "256m")

#====================
# # Get unique items in the "Name" column
# unique_names = prod_data.select("Item").distinct()
# # Show the unique names
# unique_names.show()









