In [83]:
from pyspark.sql import SparkSession

In [84]:
# Create the Spark session for this notebook; getOrCreate() hands back an
# already-running session if there is one instead of starting another.
spark = (
    SparkSession.builder
    .appName('firstClass')
    .getOrCreate()
)

In [85]:
# Evaluate the session object as the last expression of the cell so the
# notebook displays your Spark session information.
spark

In [86]:
# Load the Reddit posts CSV into a Spark DataFrame.
# Setting the 'header' option to 'True' makes the first row supply the column names.
df_pyspark = (
    spark.read
    .option('header','True')
    .csv('Reddit_post.csv')
)

In [87]:
## Display the data as a text table.
# show() prints only the first rows (20 by default) and shortens long strings with '...'.
df_pyspark.show()

+---+---------+--------------------+--------------------+-------+--------+-----+
|Num|     date|               title|              author|upvotes|downvote|karma|
+---+---------+--------------------+--------------------+-------+--------+-----+
|  0|2/23/2024|First time option...|          statusblue|      3|       1| 2311|
|  1|2/23/2024|The weekend Nvidi...| Maleficent-Oil-2848|     10|       3|  827|
|  2|2/23/2024|NASDAQ: ALVO - FD...|             1904___|      3|       2|  112|
|  3|2/23/2024|I thought this ki...|         psiphonblog|     21|       4|   94|
|  4|2/23/2024|  I burnt my tendies|   Fullspinalpackage|     44|       5|   61|
|  5|2/23/2024|       MARA earnings|           N_FLATION|     14|       3|  382|
|  6|2/23/2024|A lone developer ...|          Iky_Greenz|    123|      17| 6346|
|  7|2/23/2024|My All Time Robin...|    Savings_Lake8255|    588|      31|  271|
|  8|2/23/2024|  Unsatisfied (221%)|          schbloimps|     37|      12| 1829|
|  9|2/23/2024|Unfortunately

In [88]:
## Print the schema: every column name together with its data type.
# By default (no schema inference requested) each CSV column is read as a string.
df_pyspark.printSchema()

root
 |-- Num: string (nullable = true)
 |-- date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- author: string (nullable = true)
 |-- upvotes: string (nullable = true)
 |-- downvote: string (nullable = true)
 |-- karma: string (nullable = true)



In [89]:
## Re-read the same CSV with schema inference switched on, so Spark assigns a
## proper data type to each column instead of treating everything as a string.
df_pyspark_with_Datatype = (
    spark.read
    .option('header','True')
    .option('inferSchema', True)
    .csv('Reddit_post.csv')
)
# Print the inferred schema to compare it with the all-string schema.
df_pyspark_with_Datatype.printSchema()

root
 |-- Num: integer (nullable = true)
 |-- date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- author: string (nullable = true)
 |-- upvotes: integer (nullable = true)
 |-- downvote: integer (nullable = true)
 |-- karma: integer (nullable = true)



In [90]:
# Fetch the first 4 rows. head(n) returns them as a list of Row objects,
# not as a formatted table like show().
df_pyspark_with_Datatype.head(4)

[Row(Num=0, date='2/23/2024', title='First time options player ', author='statusblue', upvotes=3, downvote=1, karma=2311),
 Row(Num=1, date='2/23/2024', title='The weekend Nvidia play', author='Maleficent-Oil-2848', upvotes=10, downvote=3, karma=827),
 Row(Num=2, date='2/23/2024', title='NASDAQ: ALVO - FDA Approved  SIMLANDI drug ', author='1904___', upvotes=3, downvote=2, karma=112),
 Row(Num=3, date='2/23/2024', title='I thought this kind of gains only happened to other people ($PANW)', author='psiphonblog', upvotes=21, downvote=4, karma=94)]

In [91]:
# Select a single column by name and display it.
df_pyspark_with_Datatype.select('date').show()


+---------+
|     date|
+---------+
|2/23/2024|
|2/23/2024|
|2/23/2024|
|2/23/2024|
|2/23/2024|
|2/23/2024|
|2/23/2024|
|2/23/2024|
|2/23/2024|
|2/23/2024|
|2/23/2024|
|2/23/2024|
|2/23/2024|
|2/23/2024|
|2/23/2024|
|2/23/2024|
|2/23/2024|
|2/23/2024|
|2/23/2024|
|2/23/2024|
+---------+
only showing top 20 rows



In [92]:
# Select several columns at once by passing each column name as its own argument.
df_pyspark_with_Datatype.select('date', 'karma').show()

+---------+-----+
|     date|karma|
+---------+-----+
|2/23/2024| 2311|
|2/23/2024|  827|
|2/23/2024|  112|
|2/23/2024|   94|
|2/23/2024|   61|
|2/23/2024|  382|
|2/23/2024| 6346|
|2/23/2024|  271|
|2/23/2024| 1829|
|2/23/2024|28355|
|2/23/2024|   16|
|2/23/2024| 3478|
|2/23/2024|  282|
|2/23/2024|   86|
|2/23/2024|11527|
|2/23/2024|   82|
|2/23/2024| 3234|
|2/23/2024|  706|
|2/23/2024| 4367|
|2/23/2024| 7901|
+---------+-----+
only showing top 20 rows



In [93]:
# Check the data types: dtypes gives a list of (column name, type) pairs.
df_pyspark_with_Datatype.dtypes

[('Num', 'int'),
 ('date', 'string'),
 ('title', 'string'),
 ('author', 'string'),
 ('upvotes', 'int'),
 ('downvote', 'int'),
 ('karma', 'int')]

In [94]:
# Descriptive statistics: count, mean, stddev, min and max for every column.
# For string columns mean and stddev come back as NULL, while min and max are still reported.
df_pyspark_with_Datatype.describe().show()

+-------+------------------+---------+--------------------+-------------------+-----------------+------------------+------------------+
|summary|               Num|     date|               title|             author|          upvotes|          downvote|             karma|
+-------+------------------+---------+--------------------+-------------------+-----------------+------------------+------------------+
|  count|                51|       51|                  51|                 51|               51|                51|                51|
|   mean|              25.0|     NULL|                NULL|               NULL|96.33333333333333|10.549019607843137|23587.450980392157|
| stddev|14.866068747318506|     NULL|                NULL|               NULL|230.6507027231148| 18.29023097228703| 136864.3162387207|
|    min|                 0|2/23/2024|$125 --> $20k YTD...|            1904___|                0|                 0|                10|
|    max|                50|2/23/2024|🚨 🐻 Bear A

In [95]:
## Add a derived column to the DataFrame.
# Test example: the new column holds Num + 2. The result is only displayed;
# df_pyspark_with_Datatype itself is not reassigned.
num_plus_two = df_pyspark_with_Datatype['Num'] + 2
df_pyspark_with_Datatype.withColumn('Num after Increment of 2 ', num_plus_two).show()

+---+---------+--------------------+--------------------+-------+--------+-----+-------------------------+
|Num|     date|               title|              author|upvotes|downvote|karma|Num after Increment of 2 |
+---+---------+--------------------+--------------------+-------+--------+-----+-------------------------+
|  0|2/23/2024|First time option...|          statusblue|      3|       1| 2311|                        2|
|  1|2/23/2024|The weekend Nvidi...| Maleficent-Oil-2848|     10|       3|  827|                        3|
|  2|2/23/2024|NASDAQ: ALVO - FD...|             1904___|      3|       2|  112|                        4|
|  3|2/23/2024|I thought this ki...|         psiphonblog|     21|       4|   94|                        5|
|  4|2/23/2024|  I burnt my tendies|   Fullspinalpackage|     44|       5|   61|                        6|
|  5|2/23/2024|       MARA earnings|           N_FLATION|     14|       3|  382|                        7|
|  6|2/23/2024|A lone developer ...| 

In [96]:
## Drop a column.
# The result without 'title' is only displayed; it is not assigned back to the variable.
df_pyspark_with_Datatype.drop('title').show()

+---+---------+--------------------+-------+--------+-----+
|Num|     date|              author|upvotes|downvote|karma|
+---+---------+--------------------+-------+--------+-----+
|  0|2/23/2024|          statusblue|      3|       1| 2311|
|  1|2/23/2024| Maleficent-Oil-2848|     10|       3|  827|
|  2|2/23/2024|             1904___|      3|       2|  112|
|  3|2/23/2024|         psiphonblog|     21|       4|   94|
|  4|2/23/2024|   Fullspinalpackage|     44|       5|   61|
|  5|2/23/2024|           N_FLATION|     14|       3|  382|
|  6|2/23/2024|          Iky_Greenz|    123|      17| 6346|
|  7|2/23/2024|    Savings_Lake8255|    588|      31|  271|
|  8|2/23/2024|          schbloimps|     37|      12| 1829|
|  9|2/23/2024|   chewbaccashotlast|     48|      15|28355|
| 10|2/23/2024|        Broken_Wedge|      6|       4|   16|
| 11|2/23/2024|   Worried_Creme8917|     30|       7| 3478|
| 12|2/23/2024|          harrybuice|     14|       5|  282|
| 13|2/23/2024|        M_from_Vegas|    

In [97]:
## Rename a column: 'author' appears as 'Writer' in the displayed result.
# The renamed result is only displayed; it is not assigned back to the variable.
df_pyspark_with_Datatype.withColumnRenamed('author','Writer').show()

+---+---------+--------------------+--------------------+-------+--------+-----+
|Num|     date|               title|              Writer|upvotes|downvote|karma|
+---+---------+--------------------+--------------------+-------+--------+-----+
|  0|2/23/2024|First time option...|          statusblue|      3|       1| 2311|
|  1|2/23/2024|The weekend Nvidi...| Maleficent-Oil-2848|     10|       3|  827|
|  2|2/23/2024|NASDAQ: ALVO - FD...|             1904___|      3|       2|  112|
|  3|2/23/2024|I thought this ki...|         psiphonblog|     21|       4|   94|
|  4|2/23/2024|  I burnt my tendies|   Fullspinalpackage|     44|       5|   61|
|  5|2/23/2024|       MARA earnings|           N_FLATION|     14|       3|  382|
|  6|2/23/2024|A lone developer ...|          Iky_Greenz|    123|      17| 6346|
|  7|2/23/2024|My All Time Robin...|    Savings_Lake8255|    588|      31|  271|
|  8|2/23/2024|  Unsatisfied (221%)|          schbloimps|     37|      12| 1829|
|  9|2/23/2024|Unfortunately

In [98]:
## ---------------------- Handling Missing Values ------------------------------------- ##
# Dropping rows
# Various parameters of the dropping functionality
# Handling Missing Values by Mean

In [99]:
# With no arguments, na.drop() removes every row that contains at least one null value.
df_pyspark_with_Datatype.na.drop().show()
# ---------- drop(how='all') || drops a row only if ALL of its values are null. (just like AND operator)
# df_pyspark_with_Datatype.na.drop(how='all').show()

# ---------- drop(how='any') || drops a row if at least one of its values is null. (just like OR operator)
# df_pyspark_with_Datatype.na.drop(how='any').show()

# ---------- drop(thresh=2) || keeps only rows with at least 2 NON-null values, i.e. drops rows that have fewer than `thresh` non-null values.
# When thresh is given it overrides `how`, so how='any' below has no effect. Change the thresh value as per requirements.
# df_pyspark_with_Datatype.na.drop(how='any',thresh=2).show()

# ---------- subset | only the listed columns are checked for nulls when deciding whether to drop a row
# NOTE: do not combine thresh=2 with a single-column subset - one column can never hold 2 non-null values, so every row would be dropped.
# df_pyspark_with_Datatype.na.drop(how='any', subset=['column_Name']).show()

+---+---------+--------------------+--------------------+-------+--------+-----+
|Num|     date|               title|              author|upvotes|downvote|karma|
+---+---------+--------------------+--------------------+-------+--------+-----+
|  0|2/23/2024|First time option...|          statusblue|      3|       1| 2311|
|  1|2/23/2024|The weekend Nvidi...| Maleficent-Oil-2848|     10|       3|  827|
|  2|2/23/2024|NASDAQ: ALVO - FD...|             1904___|      3|       2|  112|
|  3|2/23/2024|I thought this ki...|         psiphonblog|     21|       4|   94|
|  4|2/23/2024|  I burnt my tendies|   Fullspinalpackage|     44|       5|   61|
|  5|2/23/2024|       MARA earnings|           N_FLATION|     14|       3|  382|
|  6|2/23/2024|A lone developer ...|          Iky_Greenz|    123|      17| 6346|
|  7|2/23/2024|My All Time Robin...|    Savings_Lake8255|    588|      31|  271|
|  8|2/23/2024|  Unsatisfied (221%)|          schbloimps|     37|      12| 1829|
|  9|2/23/2024|Unfortunately

In [100]:
## --------- Filling the missing values ----------------
# A string fill value replaces nulls in string columns only; nulls in numeric columns are left as they are.
df_pyspark_with_Datatype.na.fill('Missing_value').show()

## --------- Filling the missing values for one particular column -----------------
# df_pyspark_with_Datatype.na.fill('Missing_value','title').show()

## --------- Filling the missing values for several particular columns -----------------
# df_pyspark_with_Datatype.na.fill('Missing_value',['title','date']).show()

+---+---------+--------------------+--------------------+-------+--------+-----+
|Num|     date|               title|              author|upvotes|downvote|karma|
+---+---------+--------------------+--------------------+-------+--------+-----+
|  0|2/23/2024|First time option...|          statusblue|      3|       1| 2311|
|  1|2/23/2024|The weekend Nvidi...| Maleficent-Oil-2848|     10|       3|  827|
|  2|2/23/2024|NASDAQ: ALVO - FD...|             1904___|      3|       2|  112|
|  3|2/23/2024|I thought this ki...|         psiphonblog|     21|       4|   94|
|  4|2/23/2024|  I burnt my tendies|   Fullspinalpackage|     44|       5|   61|
|  5|2/23/2024|       MARA earnings|           N_FLATION|     14|       3|  382|
|  6|2/23/2024|A lone developer ...|          Iky_Greenz|    123|      17| 6346|
|  7|2/23/2024|My All Time Robin...|    Savings_Lake8255|    588|      31|  271|
|  8|2/23/2024|  Unsatisfied (221%)|          schbloimps|     37|      12| 1829|
|  9|2/23/2024|Unfortunately