<a href="https://colab.research.google.com/github/Keizerbub/weather/blob/main/wrangling_weather.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import framework

In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=520aa6a6e487c60f31c79b97c160abc54e00580f6a3b872e3d09030d3ea025e1
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [2]:
import pandas as pd
import os
import gzip
import shutil
from pyspark.sql import SparkSession

# Opening the files

In [3]:
"""ouverture des fichiers"""
class wrangling:
    """Small helper around a SparkSession for loading and cleaning one CSV file.

    Every public method is best-effort: exceptions are caught and reported
    with ``print`` instead of being propagated to the caller.

    Attributes:
        spark: SparkSession created (or reused) in ``__init__``.
        df: Current Spark DataFrame. ``None`` until ``open_file`` succeeds;
            replaced by ``select_columns`` and ``check_all_columns_null``.
        null_percentage: Percentage (0-100) of null values measured by the
            latest ``check_column_null`` call, or ``None`` if that call failed.
    """

    def __init__(self):
        """Create (or reuse) the SparkSession named "HandlingFile"."""
        self.spark = SparkSession.builder \
            .appName("HandlingFile") \
            .getOrCreate()
        # Defined up front so the other methods never hit a missing attribute.
        self.df = None
        self.null_percentage = None

    def _require_df(self):
        """Return the loaded DataFrame, or raise if ``open_file`` has not succeeded yet."""
        # getattr keeps this safe even on an instance whose __init__ did not run.
        df = getattr(self, "df", None)
        if df is None:
            raise RuntimeError("No DataFrame loaded; call open_file() first")
        return df

    def open_file(self, file_path):
        """Read a ';'-separated CSV with a header row into ``self.df`` and show it.

        Args:
            file_path: Path of the CSV file handed to ``spark.read.csv``.
        """
        try:
            # Column types are inferred by Spark (inferSchema=True).
            # For another file format use the matching reader, e.g. `spark.read.json()`.
            self.df = self.spark.read.csv(file_path, header=True, inferSchema=True, sep=";")

            # Show the DataFrame content
            self.df.show()

        except Exception as e:
            print("Error occurred:", e)

    def close_spark(self):
        """Stop the SparkSession."""
        self.spark.stop()

    def select_columns(self, *columns):
        """Replace ``self.df`` with a projection onto ``columns`` and show it.

        Args:
            *columns: Column names to keep, passed straight to ``DataFrame.select``.
        """
        try:
            # Select only the desired columns
            self.df = self._require_df().select(*columns)

            # Show the DataFrame content after selecting columns
            print("DataFrame after selecting columns:")
            self.df.show()

        except Exception as e:
            print("Error occurred:", e)

    def check_column_null(self, column_name, total_rows=None):
        """Count the nulls of one column and store their percentage.

        The result is stored in ``self.null_percentage`` and a summary line is
        printed. On failure the error is printed and ``self.null_percentage``
        is left at ``None``.

        Args:
            column_name: Name of the column to inspect.
            total_rows: Optional pre-computed row count of ``self.df``. When
                omitted the DataFrame is counted here (one extra Spark job).
        """
        # Reset first: if anything below fails, callers must not see the
        # percentage that belongs to a previously checked column.
        self.null_percentage = None
        try:
            df = self._require_df()

            # Check if the column has any null values
            null_count = df.filter(df[column_name].isNull()).count()

            if total_rows is None:
                total_rows = df.count()

            # An empty DataFrame has no nulls; also avoids a division by zero.
            if total_rows:
                self.null_percentage = (null_count / total_rows) * 100
            else:
                self.null_percentage = 0.0

            if null_count > 0:
                print(f"Column '{column_name}' has {null_count} null values for {self.null_percentage}.\n")
            else:
                print(f"Column '{column_name}' has no null values.\n")

        except Exception as e:
            print("Error occurred:", e)

    def check_all_columns_null(self):
        """Run ``check_column_null`` on every column and drop the fully-null ones.

        A column is dropped from ``self.df`` when its null percentage is
        exactly 100. Columns whose check failed are kept.
        """
        try:
            df = self._require_df()

            # Dropping columns never changes the row count, so count once
            # instead of once per column.
            total_rows = df.count()

            # Snapshot of the names: self.df is reassigned while iterating.
            for name in list(df.columns):
                self.check_column_null(name, total_rows=total_rows)
                # null_percentage is None when the check failed -> column is kept.
                if self.null_percentage == 100.0:
                    self.df = self.df.drop(name)
                    print(f'the column {name} was delete')

        except Exception as e:
            print("Error occurred:", e)

In [4]:
# One shared helper instance; constructing it starts (or reuses) the Spark session.
data = wrangling()

In [33]:
# Load the combined CSV into data.df and print a preview of it.
data.open_file("/content/H_975_combined.csv")

+-----------+---------+---------+----------+----+----------+---+----+----+-----+----+---+---+---+----+----+---+----+----+----+----+----+---+----+----+----+----+----+----+----+----+-----+----+-----+----+-----+-----+------+-----+------+------+-------+---+---+----+---+---+---+----+----+---+---+----+----+---+---+---+----+---+----+---+----+----+-----+-----+------+----+-----+---------+----------+-------+-------+---+---+---+---+----+----+---+---+----+----+-------+--------+-------+--------+---+----+------+-----+------+------+-------+-------+----+-----+---+---+----+-----+----+----+----+----+----+----+---+---+----+----+---+---+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+-----+---+------+-------+---+---+----+----+----+----+----+----+-----+------+----+-----+-----+------+-------+--------+--------+---------+------+-------+------+-------+-------+--------+--------+---------+-------+--------+----------+----------+---------+----------+---------+----------+--

In [34]:
# Keep only the columns of interest (data.df is replaced by the projection).
# NOTE: ' T', ' DHUMEC', ' N' and ' SOL' keep their leading space on purpose;
# presumably the CSV header contains it, as the null-check output prints
# them that way. Verify against the file before "cleaning" these names.
data.select_columns(
    'LAT', 'LON', 'ALTI', 'AAAAMMJJHH',
    'RR1', 'DRR1', 'FF', 'DD',
    ' T', 'TD', 'DG', 'T10',
    'TN50', 'TNSOL', ' DHUMEC', 'U',
    'UN', 'HUN', 'DHUMI40', 'DHUMI80',
    'TSV', 'PSTAT', 'PMERMIN', ' N',
    'VV', 'DVV200', 'WW', ' SOL',
    'SOLNG', 'TMER', 'HNEIGEF', 'HNEIGEFI1',
    'ESNEIGE', 'CHARGENEIGE', 'GLO', 'DIF',
    'UV_INDICE', 'INFRAR', 'INS', 'ECOULEMENT',
)

DataFrame after selecting columns:
+---------+----------+----+----------+---+----+----+---+---+----+---+---+----+-----+-------+---+---+----+-------+-------+---+------+-------+---+-----+------+---+----+-----+----+-------+---------+-------+-----------+---+----+---------+------+---+----------+
|      LAT|       LON|ALTI|AAAAMMJJHH|RR1|DRR1|  FF| DD|  T|  TD| DG|T10|TN50|TNSOL| DHUMEC|  U| UN| HUN|DHUMI40|DHUMI80|TSV| PSTAT|PMERMIN|  N|   VV|DVV200| WW| SOL|SOLNG|TMER|HNEIGEF|HNEIGEFI1|ESNEIGE|CHARGENEIGE|GLO| DIF|UV_INDICE|INFRAR|INS|ECOULEMENT|
+---------+----------+----+----------+---+----+----+---+---+----+---+---+----+-----+-------+---+---+----+-------+-------+---+------+-------+---+-----+------+---+----+-----+----+-------+---------+-------+-----------+---+----+---------+------+---+----------+
|46.766333|-56.179167|  21|2020010100|0.0|   0| 8.9|140|0.7|-3.3|  0|2.3| 0.2| -0.4|   NULL| 74| 71|2342|      0|      0|4.8|1013.8| 1014.6|  8|38429|     0|  0|NULL| NULL|NULL|   NULL|     NULL

In [30]:
# Report the null count of every column; columns that are 100% null are
# dropped from data.df as a side effect.
data.check_all_columns_null()

Column 'LAT' has no null values.

Column 'LON' has no null values.

Column 'ALTI' has no null values.

Column 'AAAAMMJJHH' has no null values.

Column 'RR1' has 273408 null values for 15.791963431180186.

Column 'DRR1' has 1597259 null values for 92.25719700273378.

Column 'FF' has 756569 null values for 43.69919673588396.

Column 'DD' has 757376 null values for 43.745808811935.

Column ' T' has 76435 null values for 4.414862494375649.

Column 'TD' has 424736 null values for 24.532622966064444.

Column 'DG' has 382571 null values for 22.09718531217095.

Column 'T10' has 1581103 null values for 91.32403132654964.

Column 'TN50' has 1598359 null values for 92.3207326702135.

Column 'TNSOL' has 1461480 null values for 84.41464300752436.

Column ' DHUMEC' has 1731263 null values for 99.99722753450997.

Column 'U' has 424642 null values for 24.527193554479812.

Column 'UN' has 704733 null values for 40.70516504544822.

Column 'HUN' has 704878 null values for 40.713540201616.

Column 'DHUMI4

In [31]:
# Frequency of each distinct RR1 value (show() prints the first 20 groups).
data.df.groupBy('RR1').count().show()

+----+-------+
| RR1|  count|
+----+-------+
|13.4|     15|
|26.7|      1|
|15.4|      7|
| 2.4|   2515|
| 8.0|     74|
|10.2|     27|
|18.3|      1|
| 0.0|1289307|
| 5.4|    280|
|16.6|      6|
|11.4|     22|
| 7.0|    134|
| 3.5|     74|
| 6.1|     63|
| 9.5|     11|
|11.6|     15|
| 7.7|     29|
| 0.2|  52777|
| 6.6|    149|
| 8.7|     25|
+----+-------+
only showing top 20 rows



In [32]:
# Number of rows per distinct LON value (show() prints the first 20 groups).
data.df.groupBy('LON').count().show()

+--------+------+
|     LON| count|
+--------+------+
|   5.688|159554|
|5.287667|174954|
|6.080333|153187|
|5.814167|174519|
|5.484167|  5715|
|5.329333|175305|
|   5.669|138135|
|   5.482|122620|
|5.968333| 22893|
|5.469333| 44358|
|5.373833|  5362|
|5.073333|116301|
|5.661667| 47911|
|6.139667|  5599|
|5.109833|  7245|
|5.426667|  5208|
|5.717167|  5282|
|5.699833|  3839|
|   5.461|116175|
|5.572833|  5193|
+--------+------+
only showing top 20 rows

