In [1]:
# Intialization
import os
import sys

os.environ["SPARK_HOME"] = "/home/talentum/spark"
os.environ["PYLIB"] = os.environ["SPARK_HOME"] + "/python/lib"
# In below two lines, use /usr/bin/python2.7 if you want to use Python 2
os.environ["PYSPARK_PYTHON"] = "/usr/bin/python3.6" 
os.environ["PYSPARK_DRIVER_PYTHON"] = "/usr/bin/python3"
sys.path.insert(0, os.environ["PYLIB"] +"/py4j-0.10.7-src.zip")
sys.path.insert(0, os.environ["PYLIB"] +"/pyspark.zip")

# NOTE: Whichever package you want mention here.
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0 pyspark-shell' 
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-avro_2.11:2.4.0 pyspark-shell'
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0,org.apache.spark:spark-avro_2.11:2.4.3 pyspark-shell'
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0,org.apache.spark:spark-avro_2.11:2.4.0 pyspark-shell'

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
from pyspark.sql.types import StringType
from functools import reduce

In [3]:
import pandas as pd
import glob

In [4]:
#Entrypoint 2.x
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("CSV Merger").enableHiveSupport().getOrCreate()



# On yarn:
# spark = SparkSession.builder.appName("Spark SQL basic example").enableHiveSupport().master("yarn").getOrCreate()
# specify .master("yarn")

sc = spark.sparkContext

In [5]:
def create_dict_from_csv(file_path):
    df = pd.read_csv(file_path)
    df['file_name'] = df['file_name'].str[:2]
    result_dict = pd.Series(df['file_name'].values, index=df['state']).to_dict()
    return result_dict

# Example usage
file_path = r'/home/talentum/myproject/datasource/stations_info.csv'
state_dict = create_dict_from_csv(file_path)
print(state_dict)


{'Andhra Pradesh': 'AP', 'Arunachal Pradesh': 'AR', 'Assam': 'AS', 'Bihar': 'BR', 'Chhattisgarh': 'CG', 'Chandigarh': 'CH', 'Delhi': 'DL', 'Gujarat': 'GJ', 'Himachal Pradesh': 'HP', 'Haryana': 'HR', 'Jharkhand': 'JH', 'Jammu and Kashmir': 'JK', 'Karnataka': 'KA', 'Kerala': 'KL', 'Maharashtra': 'MH', 'Meghalaya': 'ML', 'Manipur': 'MN', 'Madhya Pradesh': 'MP', 'Mizoram': 'MZ', 'Nagaland': 'NL', 'Odisha': 'OR', 'Punjab': 'PB', 'Puducherry': 'PY', 'Rajasthan': 'RJ', 'Sikkim': 'SK', 'Telangana': 'TG', 'Tamil Nadu': 'TN', 'Tripura': 'TR', 'Uttarakhand': 'UK', 'Uttar Pradesh': 'UP', 'West Bengal': 'WB'}


In [6]:
print(list(state_dict.values()))

['AP', 'AR', 'AS', 'BR', 'CG', 'CH', 'DL', 'GJ', 'HP', 'HR', 'JH', 'JK', 'KA', 'KL', 'MH', 'ML', 'MN', 'MP', 'MZ', 'NL', 'OR', 'PB', 'PY', 'RJ', 'SK', 'TG', 'TN', 'TR', 'UK', 'UP', 'WB']


In [8]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
from pyspark.sql.types import StringType
import os

# Initialize Spark session
spark = SparkSession.builder.appName("CSVProcessor").getOrCreate()

# Define the directory paths
directory_path = 'file:////home/talentum/myproject/datasource/archive'
savepath = 'file:////home/talentum/myproject/datasource/output'

# Define the prefixes you want to handle (assuming state_dict is defined elsewhere)
prefixes = list(state_dict.values())  # Add other prefixes as needed

# Function to normalize column names
def normalize_column_names(df):
    normalized_columns = [col.strip().replace(' ', '_').replace('.', '_').replace('(', '').replace(')', '') for col in df.columns]
    return df.toDF(*normalized_columns)

# Function to ensure correct data types
def ensure_data_types(df, columns):
    for column in columns:
        if column not in df.columns:
            df = df.withColumn(column, lit(None).cast(StringType()))
    return df.select(columns)

# Process each prefix
for prefix in prefixes:
    # Define the search pattern
    search_pattern = os.path.join(directory_path, f'{prefix}*.csv')
    
    # Read all CSV files using Spark's read method
    df = spark.read.option("header", "true").csv(search_pattern, inferSchema=True)
    
    if df.count() > 0:
        # Normalize column names
        df = normalize_column_names(df)
        
        # Identify all columns in the DataFrame
        all_columns = sorted(df.columns)  # Sort columns for consistency
        
        # Ensure DataFrame has the correct columns and data types
        df = ensure_data_types(df, all_columns)
        
        # Extract state abbreviation from file names (assuming prefix is the state abbreviation)
        state_abbr = prefix
        state_name = next((name for name, abbr in state_dict.items() if abbr == state_abbr), None)
        
        # Add the state column
        df = df.withColumn('state', lit(state_name).cast(StringType()))
        
        # Save the DataFrame to a single CSV file
        output_path = os.path.join(savepath, f'{prefix}')
        df.coalesce(1).write.option("header", "true").csv(output_path, mode='overwrite')
        print(f"Merged CSV files for prefix '{prefix}' saved as part csv file inside {prefix} folder'")
    else:
        print(f"No valid files found for prefix '{prefix}'.")

# Stop the Spark session
spark.stop()

Merged CSV files for prefix 'AP' saved as part csv file inside AP folder'
Merged CSV files for prefix 'AR' saved as part csv file inside AR folder'
Merged CSV files for prefix 'AS' saved as part csv file inside AS folder'
Merged CSV files for prefix 'BR' saved as part csv file inside BR folder'
Merged CSV files for prefix 'CG' saved as part csv file inside CG folder'
Merged CSV files for prefix 'CH' saved as part csv file inside CH folder'
Merged CSV files for prefix 'DL' saved as part csv file inside DL folder'
Merged CSV files for prefix 'GJ' saved as part csv file inside GJ folder'
Merged CSV files for prefix 'HP' saved as part csv file inside HP folder'
Merged CSV files for prefix 'HR' saved as part csv file inside HR folder'
Merged CSV files for prefix 'JH' saved as part csv file inside JH folder'
Merged CSV files for prefix 'JK' saved as part csv file inside JK folder'
Merged CSV files for prefix 'KA' saved as part csv file inside KA folder'
Merged CSV files for prefix 'KL' saved

In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, when, mean as _mean, to_date, count, isnull
from pyspark.sql.types import DoubleType
import os

# Initialize Spark session
spark = SparkSession.builder.appName("DataCleaning").getOrCreate()

def handle_outliers(df, column):
    """Handle outliers in a specific column using the IQR method."""
    Q1 = df.approxQuantile(column, [0.25], 0.01)[0]
    Q3 = df.approxQuantile(column, [0.75], 0.01)[0]
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df = df.withColumn(column, when((col(column) < lower_bound) | (col(column) > upper_bound), None).otherwise(col(column)))
    median_value = df.approxQuantile(column, [0.5], 0.01)[0]
    df = df.fillna({column: median_value})
    return df

def process_csv_file(file_path, output_dir, state_code):
    """Read, clean, and save a single CSV file."""
    # Read the CSV file into a DataFrame
    df = spark.read.option("header", "true").csv(file_path, inferSchema=True)
    
    total_columns = len(df.columns)
    missing_threshold_row = total_columns * 0.85
    df = df.withColumn('missing_count', sum([isnull(col(c)).cast('int') for c in df.columns]))
    df = df.filter(col('missing_count') <= missing_threshold_row).drop('missing_count')

    # Step 2: Drop columns with more than 50% missing values (based on the new row count)
    total_rows = df.count()
    missing_threshold_col = total_rows * 0.60
    missing_value_counts = {c: df.filter(isnull(col(c))).count() for c in df.columns}
    cols_to_drop = [c for c in missing_value_counts if missing_value_counts[c] > missing_threshold_col]
    df = df.drop(*cols_to_drop)
    
    # Handle outliers and impute missing values for numeric columns
    numeric_columns = [f.name for f in df.schema.fields if isinstance(f.dataType, DoubleType)]
    for column in numeric_columns:
        df = handle_outliers(df, column)
    
    # Remove duplicates
    df = df.dropDuplicates()
    
    # Convert date columns
    if 'From Date' in df.columns and 'To Date' in df.columns:
        df = df.withColumn('From_Date', to_date(col('From Date'), 'yyyy-MM-dd'))
        df = df.withColumn('To_Date', to_date(col('To Date'), 'yyyy-MM-dd'))
        df = df.drop('From Date', 'To Date')
    
    # Normalize column names
    normalized_columns = [col.strip().replace(' ', '_').replace('.', '_').replace('(', '').replace(')', '').lower() for col in df.columns]
    df = df.toDF(*normalized_columns)
    
    # Show the DataFrame and print remaining columns
    df.show()
    print(f"Columns after cleaning: {df.columns}")
    
    # Print a red warning if the number of columns is less than 12
    if len(df.columns) < 12:
        print("\033[91mWarning: The number of columns after cleaning is less than 10.\033[0m")
    
    # Create output directory if it doesn't exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Save the cleaned DataFrame to a single CSV file
    output_file = os.path.join(output_dir, f'{state_code}')
    df.coalesce(1).write.option("header", "true").csv(f'file://{output_file}', mode='overwrite')
    print(f'Cleaned data saved to {output_file}')

def process_all_files(state_codes, base_path, output_dir):
    """Process CSV files from multiple state codes."""
    for state_code in state_codes:
        file_path = os.path.join(base_path, state_code, '*.csv')
        process_csv_file(file_path, output_dir, state_code)

# Define state codes, base path, and output directory
state_codes = list(state_dict.values())  # Example state codes
# state_codes = ['JH']
base_path = 'file:///home/talentum/myproject/datasource/output'
output_dir = '/home/talentum/myproject/datasource/output/Cleaned'

# Process all files
process_all_files(state_codes, base_path, output_dir)

# Stop the Spark session
spark.stop()
# base_cleaned_file_path='hdfs:///user/talentum/output'

+-----------+-------+-------------+--------+-----------------+-------------------+---------------+---------+---------+--------+-------+-----------+----------+-----------+-----+-----+---------+--------+-------------+-------------------+-------------+-------+---------+------+------------+-----------------+--------------+
|at_degree_c|bp_mmhg|benzene_ug/m3|co_mg/m3|eth-benzene_ug/m3|          from_date|mp-xylene_ug/m3|nh3_ug/m3|no2_ug/m3|no_ug/m3|nox_ppb|ozone_ug/m3|pm10_ug/m3|pm2_5_ug/m3|rf_mm| rh_%|so2_ug/m3|sr_w/mt2|temp_degree_c|            to_date|toluene_ug/m3|vws_m/s|wd_degree|ws_m/s|xylene_ug/m3|             city|         state|
+-----------+-------+-------------+--------+-----------------+-------------------+---------------+---------+---------+--------+-------+-----------+----------+-----------+-----+-----+---------+--------+-------------+-------------------+-------------+-------+---------+------+------------+-----------------+--------------+
|      28.18| 750.25|          0.1|  

Cleaned data saved to /home/talentum/myproject/datasource/output/Cleaned/AR
+-----------+-------+-------------+--------+-----------------+-------------------+---------------+---------+---------+--------+-------+-----------+----------+-----------+-----+-----+---------+--------+-------------------+---------+------+--------+-----+--------------------+
|at_degree_c|bp_mmhg|benzene_ug/m3|co_mg/m3|eth-benzene_ug/m3|          from_date|mp-xylene_ug/m3|nh3_ug/m3|no2_ug/m3|no_ug/m3|nox_ppb|ozone_ug/m3|pm10_ug/m3|pm2_5_ug/m3|rf_mm| rh_%|so2_ug/m3|sr_w/mt2|            to_date|wd_degree|ws_m/s|    city|state|    station_location|
+-----------+-------+-------------+--------+-----------------+-------------------+---------------+---------+---------+--------+-------+-----------+----------+-----------+-----+-----+---------+--------+-------------------+---------+------+--------+-----+--------------------+
|      24.18|1002.72|         1.24|    0.53|             0.45|2019-03-09 17:00:00|           0.67| 

KeyboardInterrupt: 

In [10]:
from pyspark.sql import SparkSession

# Initialize a Spark session
spark = SparkSession.builder.appName("Merge CSV to Parquet").getOrCreate()

directories = [
    "file:///home/talentum/myproject/datasource/output/Cleaned/AP/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/AR/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/AS/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/BR/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/CG/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/CH/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/DL/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/GJ/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/HP/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/HR/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/JH/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/JK/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/KA/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/KL/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/MH/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/ML/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/MN/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/MP/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/MZ/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/NL/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/OR/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/PB/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/PY/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/RJ/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/SK/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/TG/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/TN/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/TR/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/UK/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/UP/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/WB/*.csv"
]
# Define the list of directories

# Initialize an empty DataFrame
merged_df = None

# Loop through the directories and read CSV files
for directory in directories:
    print(f"Reading files from: {directory}")
    df = spark.read.csv(directory, header=True, inferSchema=True)
    
    # If merged_df is None, set it to the current df, otherwise union it with the existing merged_df
    if merged_df is None:
        merged_df = df
    else:
        merged_df = merged_df.unionByName(df)

print("Finished merging all CSV files.")

# Save the merged DataFrame as a single Parquet file
output_path = "file:///home/talentum/myproject/dataSource/output/india_region.parquet"
merged_df.write.mode("overwrite").parquet(output_path)

print(f"Finished saving the merged DataFrame as Parquet at {output_path}")

# Stop the Spark session
spark.stop()
print("Spark session stopped.")


Reading files from: file:///home/talentum/myproject/datasource/output/Cleaned/AP/*.csv
Reading files from: file:///home/talentum/myproject/datasource/output/Cleaned/AR/*.csv


AnalysisException: 'Cannot resolve column name "temp_degree_c" among (at_degree_c, bp_mmhg, benzene_ug/m3, co_mg/m3, eth-benzene_ug/m3, from_date, mp-xylene_ug/m3, nh3_ug/m3, no2_ug/m3, no_ug/m3, nox_ppb, ozone_ug/m3, pm10_ug/m3, pm2_5_ug/m3, rf_mm, rh_%, so2_ug/m3, sr_w/mt2, to_date, wd_degree, ws_m/s, state);'

In [11]:
print(df.columns)


['at_degree_c', 'bp_mmhg', 'benzene_ug/m3', 'co_mg/m3', 'eth-benzene_ug/m3', 'from_date', 'mp-xylene_ug/m3', 'nh3_ug/m3', 'no2_ug/m3', 'no_ug/m3', 'nox_ppb', 'ozone_ug/m3', 'pm10_ug/m3', 'pm2_5_ug/m3', 'rf_mm', 'rh_%', 'so2_ug/m3', 'sr_w/mt2', 'to_date', 'wd_degree', 'ws_m/s', 'state']


In [12]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit

# Initialize a Spark session
spark = SparkSession.builder.appName("CSV Merge").getOrCreate()

# List of directories (replace with your actual list)
directories = [
    "file:///home/talentum/myproject/datasource/output/Cleaned/AP/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/AR/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/AS/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/BR/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/CG/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/CH/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/DL/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/GJ/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/HP/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/HR/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/JH/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/JK/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/KA/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/KL/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/MH/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/ML/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/MN/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/MP/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/MZ/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/NL/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/OR/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/PB/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/PY/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/RJ/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/SK/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/TG/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/TN/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/TR/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/UK/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/UP/*.csv",
    "file:///home/talentum/myproject/datasource/output/Cleaned/WB/*.csv"
]


dfs = []
all_columns = set()

print("Starting to read CSV files...")

# Read the CSV files and update the list of all columns
for directory in directories:
    print(f"Reading files from: {directory}")
    df = spark.read.csv(directory, header=True, inferSchema=True)
    print(f"Read {df.count()} rows from: {directory}")
    dfs.append(df)
    all_columns.update(df.columns)

print("Finished reading CSV files.")
print("Starting to clean column names...")

# Clean column names
def clean_column_names(df):
    for col_name in df.columns:
        new_col_name = col_name.replace('-', '_').replace('/', '_').replace(' ', '_')
        df = df.withColumnRenamed(col_name, new_col_name)
    return df

dfs = [clean_column_names(df) for df in dfs]
all_columns = {col_name.replace('-', '_').replace('/', '_').replace(' ', '_') for col_name in all_columns}

print("Finished cleaning column names.")
print("Starting to ensure all DataFrames have the same columns...")

# Ensure all DataFrames have all columns
def add_missing_columns(df, all_columns):
    for column in all_columns:
        if column not in df.columns:
            df = df.withColumn(column, lit(None))
    return df

dfs = [add_missing_columns(df, all_columns) for df in dfs]

print("Finished ensuring all DataFrames have the same columns.")
print("Starting to select columns in the same order for consistency...")

# Select the columns in the same order for consistency
dfs = [df.select(*sorted(all_columns)) for df in dfs]

print("Finished selecting columns in the same order.")
print("Starting to merge DataFrames...")

# Merge the DataFrames
merged_df = dfs[0]
for df in dfs[1:]:
    merged_df = merged_df.unionByName(df)

print("Finished merging DataFrames.")
print("Starting to save the merged DataFrame to a Parquet file...")

# Coalesce to a single partition and save the merged DataFrame to a Parquet file
merged_df.coalesce(1).write.parquet("file:///home/talentum/myproject/datasource/output/merged_final")

print("Finished saving the merged DataFrame as Parquet.")
print("Stopping the Spark session...")

# Stop the Spark session
spark.stop()

print("Spark session stopped.")


Starting to read CSV files...
Reading files from: file:///home/talentum/myproject/datasource/output/Cleaned/AP/*.csv
Read 225716 rows from: file:///home/talentum/myproject/datasource/output/Cleaned/AP/*.csv
Reading files from: file:///home/talentum/myproject/datasource/output/Cleaned/AR/*.csv
Read 8131 rows from: file:///home/talentum/myproject/datasource/output/Cleaned/AR/*.csv
Reading files from: file:///home/talentum/myproject/datasource/output/Cleaned/AS/*.csv
Read 73172 rows from: file:///home/talentum/myproject/datasource/output/Cleaned/AS/*.csv
Reading files from: file:///home/talentum/myproject/datasource/output/Cleaned/BR/*.csv
Read 636501 rows from: file:///home/talentum/myproject/datasource/output/Cleaned/BR/*.csv
Reading files from: file:///home/talentum/myproject/datasource/output/Cleaned/CG/*.csv
Read 49048 rows from: file:///home/talentum/myproject/datasource/output/Cleaned/CG/*.csv
Reading files from: file:///home/talentum/myproject/datasource/output/Cleaned/CH/*.csv
Re

In [13]:
from pyspark.sql import SparkSession

# Initialize a Spark session
spark = SparkSession.builder.appName("Parquet to CSV").getOrCreate()

# Read the Parquet file
parquet_file_path = "file:///home/talentum/myproject/dataSource/output/merged_final"
df = spark.read.parquet(parquet_file_path)

# Define the output directory for the CSV file
output_csv_path = "file:///home/talentum/myproject/dataSource/output/merged_final_csv"

# Save the DataFrame as a CSV file
df.coalesce(1).write.csv(output_csv_path, header=True)

# Stop the Spark session
spark.stop()


In [14]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, mean, count, when

# Initialize a Spark session
spark = SparkSession.builder.appName("CSV Analysis").getOrCreate()

# Read the CSV file
csv_file_path = "file:///home/talentum/myproject/dataSource/output/merged_final_csv/final.csv"
df = spark.read.csv(csv_file_path, header=True, inferSchema=True)

# Show the first 20 rows of the DataFrame
print("Showing the first 20 rows:")
df.show(20)

# Display the columns in the DataFrame
print("Columns in the DataFrame:")
print(df.columns)

# Count the total number of rows in the DataFrame
total_rows = df.count()
print(f"Total number of rows: {total_rows}")

# Count the number of null values in each column
print("Number of null values in each column:")
df.select([count(when(col(c).isNull(), c)).alias(c) for c in df.columns]).show()

# Calculate the mean of each numeric column
print("Mean of each numeric column:")
numeric_columns = [c for c, t in df.dtypes if t in ['int', 'double', 'float']]
df.select([mean(col(c)).alias(c) for c in numeric_columns]).show()

# Stop the Spark session
spark.stop()


Showing the first 20 rows:
+---------+-----------+-------------+-------+--------+-----------------+----------------+---------------+---------+---------+--------+-------+---------+--------------+-----------+----------+-----------+-----+-----+---------+---------+--------+--------------+-----+-------------+----------------+-------------+-------+------+---------+------+------------+
|at_degree|at_degree_c|benzene_ug_m3|bp_mmhg|co_mg_m3|eth_benzene_ug_m3|       from_date|mp_xylene_ug_m3|nh3_ug_m3|no2_ug_m3|no_ug_m3|nox_ppb|nox_ug_m3|o_xylene_ug_m3|ozone_ug_m3|pm10_ug_m3|pm2_5_ug_m3|rf_mm| rh_%|rh_degree|so2_ug_m3|sr_w_mt2|         state|temp_|temp_degree_c|         to_date|toluene_ug_m3|vws_m_s|wd_deg|wd_degree|ws_m_s|xylene_ug_m3|
+---------+-----------+-------------+-------+--------+-----------------+----------------+---------------+---------+---------+--------+-------+---------+--------------+-----------+----------+-----------+-----+-----+---------+---------+--------+--------------+-----

Total number of rows: 12119438
Number of null values in each column:
+---------+-----------+-------------+-------+--------+-----------------+---------+---------------+---------+---------+--------+-------+---------+--------------+-----------+----------+-----------+--------+------+---------+---------+--------+-----+--------+-------------+-------+-------------+-------+-------+---------+------+------------+
|at_degree|at_degree_c|benzene_ug_m3|bp_mmhg|co_mg_m3|eth_benzene_ug_m3|from_date|mp_xylene_ug_m3|nh3_ug_m3|no2_ug_m3|no_ug_m3|nox_ppb|nox_ug_m3|o_xylene_ug_m3|ozone_ug_m3|pm10_ug_m3|pm2_5_ug_m3|   rf_mm|  rh_%|rh_degree|so2_ug_m3|sr_w_mt2|state|   temp_|temp_degree_c|to_date|toluene_ug_m3|vws_m_s| wd_deg|wd_degree|ws_m_s|xylene_ug_m3|
+---------+-----------+-------------+-------+--------+-----------------+---------+---------------+---------+---------+--------+-------+---------+--------------+-----------+----------+-----------+--------+------+---------+---------+--------+-----+--------+

In [15]:
print(f"Number of columns: {len(df.columns)}, Column names: {df.columns}")


Number of columns: 32, Column names: ['at_degree', 'at_degree_c', 'benzene_ug_m3', 'bp_mmhg', 'co_mg_m3', 'eth_benzene_ug_m3', 'from_date', 'mp_xylene_ug_m3', 'nh3_ug_m3', 'no2_ug_m3', 'no_ug_m3', 'nox_ppb', 'nox_ug_m3', 'o_xylene_ug_m3', 'ozone_ug_m3', 'pm10_ug_m3', 'pm2_5_ug_m3', 'rf_mm', 'rh_%', 'rh_degree', 'so2_ug_m3', 'sr_w_mt2', 'state', 'temp_', 'temp_degree_c', 'to_date', 'toluene_ug_m3', 'vws_m_s', 'wd_deg', 'wd_degree', 'ws_m_s', 'xylene_ug_m3']


In [16]:
df.select("at_degree").show(10)


AttributeError: 'NoneType' object has no attribute '_jvm'

In [17]:
from pyspark.sql import SparkSession

# Initialize a Spark session
spark = SparkSession.builder.appName("CSV Analysis").getOrCreate()

# Read the CSV file
csv_file_path = "file:///home/talentum/myproject/dataSource/output/merged_final_csv/final.csv"
df = spark.read.csv(csv_file_path, header=True, inferSchema=True)

# Check if Spark session is active
print(spark)


<pyspark.sql.session.SparkSession object at 0x7ff72ec03470>


In [18]:
df.select("at_degree").show(10)


+---------+
|at_degree|
+---------+
|     null|
|     null|
|     null|
|     null|
|     null|
|     null|
|     null|
|     null|
|     null|
|     null|
+---------+
only showing top 10 rows



In [19]:
from pyspark.sql.functions import col, isnan, when, count

# Count the number of null values in the 'at_degree' column
null_count = df.select(count(when(col("at_degree").isNull(), "at_degree")).alias("null_count")).collect()[0]["null_count"]

# Count the number of non-null values in the 'at_degree' column
non_null_count = df.select(count(col("at_degree")).alias("non_null_count")).collect()[0]["non_null_count"]

print(f"Number of null values in 'at_degree': {null_count}")
print(f"Number of non-null values in 'at_degree': {non_null_count}")


Number of null values in 'at_degree': 12108878
Number of non-null values in 'at_degree': 10560


In [20]:
df.select("at_degree_c").show(10)

+-----------+
|at_degree_c|
+-----------+
|      21.47|
|      27.97|
|      22.37|
|      26.47|
|       24.0|
|      23.37|
|       22.0|
|      22.27|
|      23.05|
|       21.6|
+-----------+
only showing top 10 rows



In [21]:
null_count = df.select(count(when(col("at_degree").isNull(), "at_degree_c")).alias("null_count")).collect()[0]["null_count"]

# Count the number of non-null values in the 'at_degree' column
non_null_count = df.select(count(col("at_degree")).alias("non_null_count")).collect()[0]["non_null_count"]

print(f"Number of null values in 'at_degree': {null_count}")
print(f"Number of non-null values in 'at_degree': {non_null_count}")

Number of null values in 'at_degree': 12108878
Number of non-null values in 'at_degree': 10560
