<a href="https://colab.research.google.com/github/Keizerbub/weather/blob/main/wrangling_weather.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import framework

In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=1ed6168c3d83aafdb188df4265403a5a1588c1b8b71cc938f1b13d601bc299d7
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [2]:
import pandas as pd
import os
import gzip
import shutil
from pyspark.sql import SparkSession

# Opening the files

In [6]:
"""ouverture des fichiers"""
class wrangling:
    """Small helper that loads one CSV file with Spark and cleans it step by step.

    The instance owns a SparkSession (``self.spark``) and the current working
    DataFrame (``self.df``). Each transformation method replaces ``self.df``
    with its result and prints what it did. Errors are reported with ``print``
    rather than raised, so a failing step does not stop the notebook.

    Attributes:
        spark: SparkSession created (or reused) in ``__init__``.
        df: Current DataFrame, or ``None`` until ``open_file`` succeeds.
        null_percentage: Percentage of nulls found by the most recent
            successful ``check_column_null`` call, or ``None`` if that call
            failed or no check has run yet.
    """

    # Columns kept by ``select_columns(columns_state=False)``.
    DEFAULT_COLUMNS = ('LAT', 'LON', 'AAAAMMJJHH', 'RR1', 'FF', 'TN50')

    def __init__(self):
        """Create (or reuse) the SparkSession named "HandlingFile"."""
        self.spark = (
            SparkSession.builder
            .appName("HandlingFile")
            .getOrCreate()
        )
        # Declared up front so every attribute the methods rely on exists
        # even before a file has been opened or a null check has run.
        self.df = None
        self.null_percentage = None

    def open_file(self, file_path):
        """Read a ';'-separated CSV with a header row into ``self.df`` and show it.

        Column types are inferred by Spark (``inferSchema=True``). On failure
        the error is printed and ``self.df`` keeps its previous value.

        Args:
            file_path: Path of the CSV file to read.
        """
        try:
            self.df = self.spark.read.csv(file_path, header=True, inferSchema=True, sep=";")
            self.df.show()
        except Exception as e:
            print("Error occurred:", e)

    def close_spark(self):
        """Stop the SparkSession owned by this instance."""
        self.spark.stop()

    def select_columns(self, *columns, columns_state=True):
        """Restrict ``self.df`` to the given columns and show the result.

        Args:
            *columns: Names of the columns to keep.
            columns_state: Boolean flag. When false, ``*columns`` is ignored
                and ``DEFAULT_COLUMNS`` is selected instead.
        """
        if not columns_state:
            columns = self.DEFAULT_COLUMNS

        try:
            self.df = self.df.select(*columns)
            print("DataFrame after selecting columns:")
            self.df.show()
        except Exception as e:
            print("Error occurred:", e)

    def check_column_null(self, column_name, total_rows=None):
        """Count the nulls in one column, print a summary and return the percentage.

        Args:
            column_name: Name of the column to inspect.
            total_rows: Optional pre-computed row count of ``self.df``. When
                omitted, the DataFrame is counted here. Callers that check
                many columns can pass it to avoid recounting every time.

        Returns:
            The percentage (0-100) of null values in the column, also stored
            in ``self.null_percentage``. Returns ``None`` (and stores ``None``)
            when the check fails; the error is printed.
        """
        try:
            null_count = self.df.filter(self.df[column_name].isNull()).count()
            if total_rows is None:
                total_rows = self.df.count()

            # An empty DataFrame has no nulls; guard the division instead of
            # reporting a ZeroDivisionError as a generic failure.
            if total_rows:
                self.null_percentage = (null_count / total_rows) * 100
            else:
                self.null_percentage = 0.0

            if null_count > 0:
                print(f"Column '{column_name}' has {null_count} null values for {self.null_percentage}.\n")
            else:
                print(f"Column '{column_name}' has no null values.\n")

            return self.null_percentage

        except Exception as e:
            # Reset the attribute so a failed check can never be mistaken for
            # the result of the previously checked column.
            self.null_percentage = None
            print("Error occurred:", e)
            return None

    def check_all_columns_null(self):
        """Run ``check_column_null`` on every column and drop the fully-null ones.

        A column is removed from ``self.df`` only when its own check succeeded
        and reported exactly 100 % nulls.
        """
        try:
            # Dropping columns does not change the number of rows, so count once.
            total_rows = self.df.count()

            # Iterate over a snapshot: self.df is reassigned inside the loop.
            for column_name in list(self.df.columns):
                percentage = self.check_column_null(column_name, total_rows=total_rows)
                # Use the value returned for *this* column; a failed check
                # returns None and therefore never triggers a drop.
                if percentage == 100.0:
                    self.df = self.df.drop(column_name)
                    print(f'the column {column_name} was delete')

        except Exception as e:
            print("Error occurred:", e)

In [7]:
# Build the helper; its constructor creates (or reuses) the SparkSession.
data = wrangling()

In [8]:
# Load the ';'-separated CSV into data.df and display it.
data.open_file("/content/H_75__combined.csv")

+-----------+---------------+---------+--------+----+----------+---+----+----+-----+---+---+---+---+----+----+----+----+----+----+----+----+---+----+----+----+----+----+----+----+----+-----+----+-----+----+-----+-----+------+-----+------+------+-------+----+---+---+---+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+-----+-----+------+----+-----+---------+----------+-------+-------+---+---+----+----+----+----+----+----+----+----+-------+--------+-------+--------+----+----+----+-----+-----+------+-------+-------+----+-----+----+----+----+-----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+------+-------+----+----+----+----+----+----+----+----+-----+------+----+-----+-----+------+-------+--------+--------+---------+------+-------+------+-------+-------+--------+--------+---------+-------+--------+----------+----------+---------+----------+

In [9]:
# columns_state=False makes the helper keep its built-in column subset
# (LAT, LON, AAAAMMJJHH, RR1, FF, TN50) instead of caller-supplied names.
data.select_columns(columns_state=False)

DataFrame after selecting columns:
+---------+--------+----------+---+---+----+
|      LAT|     LON|AAAAMMJJHH|RR1| FF|TN50|
+---------+--------+----------+---+---+----+
|48.858333|2.348333|2000010100|0.0|1.0|NULL|
|48.858333|2.348333|2000010101|0.0|1.0|NULL|
|48.858333|2.348333|2000010102|0.0|1.0|NULL|
|48.858333|2.348333|2000010103|0.0|4.0|NULL|
|48.858333|2.348333|2000010104|0.0|3.0|NULL|
|48.858333|2.348333|2000010105|0.2|2.0|NULL|
|48.858333|2.348333|2000010106|0.6|2.0|NULL|
|48.858333|2.348333|2000010107|0.0|2.0|NULL|
|48.858333|2.348333|2000010108|0.0|3.0|NULL|
|48.858333|2.348333|2000010109|0.0|3.0|NULL|
|48.858333|2.348333|2000010110|0.0|4.0|NULL|
|48.858333|2.348333|2000010111|0.0|3.0|NULL|
|48.858333|2.348333|2000010112|0.0|4.0|NULL|
|48.858333|2.348333|2000010113|0.0|3.0|NULL|
|48.858333|2.348333|2000010114|0.0|3.0|NULL|
|48.858333|2.348333|2000010115|0.0|4.0|NULL|
|48.858333|2.348333|2000010116|0.0|3.0|NULL|
|48.858333|2.348333|2000010117|0.0|2.0|NULL|
|48.858333|2.348333|

In [10]:
# Print the null count for every column and drop the columns that are 100 % null.
data.check_all_columns_null()

Column 'LAT' has no null values.

Column 'LON' has no null values.

Column 'AAAAMMJJHH' has no null values.

Column 'RR1' has 84239 null values for 91.67374034171291.

Column 'FF' has 22060 null values for 24.006964849276308.

Column 'TN50' has 90170 null values for 98.12819675699205.



In [11]:
# Number of rows for each distinct LAT value.
data.df.groupBy("LAT").count().show()

+---------+-----+
|      LAT|count|
+---------+-----+
|48.858333|90150|
|48.844667|    1|
|   48.835|    4|
|48.848167|    5|
|48.819833|    5|
|48.831667|    4|
|48.821667| 1721|
+---------+-----+



In [12]:
# Number of rows for each distinct LON value.
data.df.groupBy("LON").count().show()

+--------+-----+
|     LON|count|
+--------+-----+
|2.348333| 5954|
|  2.2945|84196|
|2.333833|    1|
|2.337833| 1721|
|2.380833|    5|
|  2.4375|    4|
|2.411667|    4|
|2.456833|    5|
+--------+-----+

