In [156]:
# Required packages and libraries
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr, col, trim, explode, lit, when
import warnings

#filter out warnings
warnings.filterwarnings("ignore")

In [124]:
# Initalize spark session
spark = SparkSession.builder.getOrCreate()

In [125]:
# Get the dataset
prod_data = (spark\
            .read\
            .option("inferSchema", "true")\
            .option("header", "true")\
            .csv("Production_Crops_Livestock_E_Africa.csv"))

In [126]:
# Configure output partitions
spark.conf.set("spark.sql.shuffle.partitions", "5")

There are columns that do not add direct context to my analysis. I will drop them.

In [128]:
# Drop columns

# drop flag cols
to_drop = [col for col in prod_data.columns if col.endswith("F")]

# drop unnecessary cols
to_drop1 = ['Area Code','Area Code (M49)','Item Code','Item Code (CPC)','Element Code','Unit']

# implement
prod_data = (prod_data\
            .drop(*to_drop, *to_drop1))

In [129]:
# Rename columns
#rename year cols
rename_cols = [col.replace('Y', '') for col in prod_data.columns]
prod_data = prod_data.toDF(*renamed_cols)

#rename other cols
prod_data = prod_data\
                .withColumnRenamed("Area", "Country")


In [151]:
# Trim column names
prod_data = prod_data.select([col(name).alias(name.strip()) for name in prod_data.columns])
prod_data.printSchema()

root
 |-- Country: string (nullable = true)
 |-- Item: string (nullable = true)
 |-- Element: string (nullable = true)
 |-- 1961: double (nullable = true)
 |-- 1962: double (nullable = true)
 |-- 1963: double (nullable = true)
 |-- 1964: double (nullable = true)
 |-- 1965: double (nullable = true)
 |-- 1966: double (nullable = true)
 |-- 1967: double (nullable = true)
 |-- 1968: double (nullable = true)
 |-- 1969: double (nullable = true)
 |-- 1970: double (nullable = true)
 |-- 1971: double (nullable = true)
 |-- 1972: double (nullable = true)
 |-- 1973: double (nullable = true)
 |-- 1974: double (nullable = true)
 |-- 1975: double (nullable = true)
 |-- 1976: double (nullable = true)
 |-- 1977: double (nullable = true)
 |-- 1978: double (nullable = true)
 |-- 1979: double (nullable = true)
 |-- 1980: double (nullable = true)
 |-- 1981: double (nullable = true)
 |-- 1982: double (nullable = true)
 |-- 1983: double (nullable = true)
 |-- 1984: double (nullable = true)
 |-- 1985: double

In [161]:
# Assuming you have a DataFrame named 'df' with the year columns and other columns
keep_columns = ['Area', 'Item', 'Element']
year_columns = [col for col in df.columns if col not in keep_columns]

# Melt the year columns
prod_data = prod_data.select(keep_columns + [
    explode(
        [
            (lit(col).alias('year'), col)
            for col in year_columns
        ]
    ).alias('melted')
]).select(keep_columns + [
    col('melted.year'),
    col('melted.weight')
])

prod_data.show()


TypeError: Invalid argument, not a string or column: [(Column<'Unit AS year'>, 'Unit'), (Column<'Y1961 AS year'>, 'Y1961'), (Column<'Y1962 AS year'>, 'Y1962'), (Column<'Y1963 AS year'>, 'Y1963'), (Column<'Y1964 AS year'>, 'Y1964'), (Column<'Y1965 AS year'>, 'Y1965'), (Column<'Y1966 AS year'>, 'Y1966'), (Column<'Y1967 AS year'>, 'Y1967'), (Column<'Y1968 AS year'>, 'Y1968'), (Column<'Y1969 AS year'>, 'Y1969'), (Column<'Y1970 AS year'>, 'Y1970'), (Column<'Y1971 AS year'>, 'Y1971'), (Column<'Y1972 AS year'>, 'Y1972'), (Column<'Y1973 AS year'>, 'Y1973'), (Column<'Y1974 AS year'>, 'Y1974'), (Column<'Y1975 AS year'>, 'Y1975'), (Column<'Y1976 AS year'>, 'Y1976'), (Column<'Y1977 AS year'>, 'Y1977'), (Column<'Y1978 AS year'>, 'Y1978'), (Column<'Y1979 AS year'>, 'Y1979'), (Column<'Y1980 AS year'>, 'Y1980'), (Column<'Y1981 AS year'>, 'Y1981'), (Column<'Y1982 AS year'>, 'Y1982'), (Column<'Y1983 AS year'>, 'Y1983'), (Column<'Y1984 AS year'>, 'Y1984'), (Column<'Y1985 AS year'>, 'Y1985'), (Column<'Y1986 AS year'>, 'Y1986'), (Column<'Y1987 AS year'>, 'Y1987'), (Column<'Y1988 AS year'>, 'Y1988'), (Column<'Y1989 AS year'>, 'Y1989'), (Column<'Y1990 AS year'>, 'Y1990'), (Column<'Y1991 AS year'>, 'Y1991'), (Column<'Y1992 AS year'>, 'Y1992'), (Column<'Y1993 AS year'>, 'Y1993'), (Column<'Y1994 AS year'>, 'Y1994'), (Column<'Y1995 AS year'>, 'Y1995'), (Column<'Y1996 AS year'>, 'Y1996'), (Column<'Y1997 AS year'>, 'Y1997'), (Column<'Y1998 AS year'>, 'Y1998'), (Column<'Y1999 AS year'>, 'Y1999'), (Column<'Y2000 AS year'>, 'Y2000'), (Column<'Y2001 AS year'>, 'Y2001'), (Column<'Y2002 AS year'>, 'Y2002'), (Column<'Y2003 AS year'>, 'Y2003'), (Column<'Y2004 AS year'>, 'Y2004'), (Column<'Y2005 AS year'>, 'Y2005'), (Column<'Y2006 AS year'>, 'Y2006'), (Column<'Y2007 AS year'>, 'Y2007'), (Column<'Y2008 AS year'>, 'Y2008'), (Column<'Y2009 AS year'>, 'Y2009'), (Column<'Y2010 AS year'>, 'Y2010'), (Column<'Y2011 AS year'>, 'Y2011'), (Column<'Y2012 AS year'>, 'Y2012'), (Column<'Y2013 AS year'>, 'Y2013'), (Column<'Y2014 AS year'>, 'Y2014'), (Column<'Y2015 AS year'>, 'Y2015'), (Column<'Y2016 AS year'>, 'Y2016'), (Column<'Y2017 AS year'>, 'Y2017'), (Column<'Y2018 AS year'>, 'Y2018'), (Column<'Y2019 AS year'>, 'Y2019'), (Column<'Y2020 AS year'>, 'Y2020'), (Column<'Y2021 AS year'>, 'Y2021')] of type <class 'list'>. For column literals, use 'lit', 'array', 'struct' or 'create_map' function.

In [None]:
# Meditation

 # .withColumnRenamed("year", trim(col("year")).alias("year"))