In [1]:
# https://github.com/johanna23cct/integrated-CA2-MSc-2023094.git

In [2]:
#git remote add origin https://github.com/johanna23cct/integrated-CA2-MSc-2023094.git
#git branch -M main
#git push -u origin main

In [3]:
pip install Matplotlib

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install bokeh

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [5]:
pip install skforecast --user

Note: you may need to restart the kernel to use updated packages.


In [6]:
# Data manipulation
# ==============================================================================
import numpy as np
import pandas as pd

# Plots
# ==============================================================================
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
plt.rcParams['lines.linewidth'] = 1.5
%matplotlib inline

# Modeling and Forecasting
# ==============================================================================
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

from skforecast.ForecasterAutoreg import ForecasterAutoreg
from skforecast.ForecasterAutoregCustom import ForecasterAutoregCustom
#from skforecast.ForecasterAutoregMultiOutput import ForecasterAutoregMultiOutput
from skforecast.model_selection import grid_search_forecaster
from skforecast.model_selection import backtesting_forecaster

from joblib import dump, load

# Spark
# import SparkSession library 
from pyspark.sql import SparkSession
from pyspark.sql.functions import lower, regexp_replace
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType
from pyspark.sql.types import *
import pyspark.sql.types as typ
import pyspark.sql.functions as fn
import pyspark.sql.functions as F

# Warnings configuration
# ==============================================================================
import warnings
# warnings.filterwarnings('ignore')

In [7]:
# SparkContext HOW I CAN SHOW THAT ... SC  It is not working to me, in the first time!
# that ok now, I know, is sc.master.  I was put only sc
sc.master

'local[*]'

In [8]:

spark = SparkSession.builder.appName('data_projectTweets').getOrCreate()

In [9]:
# creating a customer dataframe by declaring the schema and passing values


In [10]:
schema = StructType().add("Ids","integer").add("Date","string").add("Flag", "string").add("User",'string').add("Text", "string")

In [11]:
# insert the values
df = spark.createDataFrame([
        (1467810369, "Mon apr 06 22:19:45 PDT 2009", "NO_Query", "_theSpecialOne", "@switchfoot http://twitpic.com/2y1zl - Awww, t..."), 
        (1467810672, "Mon Apr 06 22:19:49 PDT 2009", "NO_Query", "scotthamilton", "is upset that he can not update his Facebook b..." ),
        (1467811184, "Mon Apr 06 22:19:57 PDT 2009", "NO_Query", "mattycus", "@Kenichan I dived many times for the ball. Man..."),
    ], ['Ids', 'Date', 'Flag', 'User', 'Text'])

In [12]:
# Check for Duplex

In [13]:
print('Count of row: {0}' .format(df.count()))
print('Count of distinct rows: {0}' .format(df.distinct().count()))

                                                                                

Count of row: 3




Count of distinct rows: 3


                                                                                

In [14]:
#If these two numbers differ - you have rows that are exact copies of each other. We can drop these rows by using the .dropDuplicates(...) method.

In [15]:
df = df.dropDuplicates()
df.show()

+----------+--------------------+--------+--------------+--------------------+
|       Ids|                Date|    Flag|          User|                Text|
+----------+--------------------+--------+--------------+--------------------+
|1467810369|Mon apr 06 22:19:...|NO_Query|_theSpecialOne|@switchfoot http:...|
|1467810672|Mon Apr 06 22:19:...|NO_Query| scotthamilton|is upset that he ...|
|1467811184|Mon Apr 06 22:19:...|NO_Query|      mattycus|@Kenichan I dived...|
+----------+--------------------+--------+--------------+--------------------+



In [16]:
#Let's confirm.

In [17]:
print('Count of ids: {0}'.format(df.count()))
print('Count of distinct ids: {0}'.format(df.select([c for c in df.columns if c != 'id']).distinct().count()))

Count of ids: 3
Count of distinct ids: 3


In [18]:
#We still have one more duplicate. We will use the .dropDuplicates(...) but add the subset parameter.

In [19]:
df = df.dropDuplicates(subset=[c for c in df.columns if c != 'id'])
df.show()

+----------+--------------------+--------+--------------+--------------------+
|       Ids|                Date|    Flag|          User|                Text|
+----------+--------------------+--------+--------------+--------------------+
|1467810369|Mon apr 06 22:19:...|NO_Query|_theSpecialOne|@switchfoot http:...|
|1467810672|Mon Apr 06 22:19:...|NO_Query| scotthamilton|is upset that he ...|
|1467811184|Mon Apr 06 22:19:...|NO_Query|      mattycus|@Kenichan I dived...|
+----------+--------------------+--------+--------------+--------------------+



In [20]:
#To calculate the total and distinct number of IDs in one step we can use the .agg(...) method

In [21]:
import pyspark.sql.functions as fn

df.agg(
    fn.count('ids').alias('count'),
    fn.countDistinct('ids').alias('distinct')
).show()

+-----+--------+
|count|distinct|
+-----+--------+
|    3|       3|
+-----+--------+



In [22]:
#Give each row a unique ID.

In [23]:
df.withColumn('new_id', fn.monotonically_increasing_id()).show()

+----------+--------------------+--------+--------------+--------------------+------+
|       Ids|                Date|    Flag|          User|                Text|new_id|
+----------+--------------------+--------+--------------+--------------------+------+
|1467810369|Mon apr 06 22:19:...|NO_Query|_theSpecialOne|@switchfoot http:...|     0|
|1467810672|Mon Apr 06 22:19:...|NO_Query| scotthamilton|is upset that he ...|     1|
|1467811184|Mon Apr 06 22:19:...|NO_Query|      mattycus|@Kenichan I dived...|     2|
+----------+--------------------+--------+--------------+--------------------+------+



In [24]:
# Display the structure of Schema
df.printSchema()

root
 |-- Ids: long (nullable = true)
 |-- Date: string (nullable = true)
 |-- Flag: string (nullable = true)
 |-- User: string (nullable = true)
 |-- Text: string (nullable = true)



In [25]:
# Show the data
df.show()

+----------+--------------------+--------+--------------+--------------------+
|       Ids|                Date|    Flag|          User|                Text|
+----------+--------------------+--------+--------------+--------------------+
|1467810369|Mon apr 06 22:19:...|NO_Query|_theSpecialOne|@switchfoot http:...|
|1467810672|Mon Apr 06 22:19:...|NO_Query| scotthamilton|is upset that he ...|
|1467811184|Mon Apr 06 22:19:...|NO_Query|      mattycus|@Kenichan I dived...|
+----------+--------------------+--------+--------------+--------------------+



In [26]:
# Create a new dataframe with nukll values 
df_na = spark.createDataFrame([(1467810369, "Mon apr 06 22:19:45 PDT 2009", None, "_theSpecialOne", "@switchfoot http://twitpic.com/2y1zl - Awww, t..."), (1467810672, "Mon Apr 06 22:19:49 PDT 2009", None, "scotthamilton", "is upset that he can not update his Facebook b..." ), (1467811184, "Mon Apr 06 22:19:57 PDT 2009", None, "mattycus","@Kenichan I dived many times for the ball. Man...")],schema=schema)

In [27]:
df_na.show()

+----------+--------------------+----+--------------+--------------------+
|       Ids|                Date|Flag|          User|                Text|
+----------+--------------------+----+--------------+--------------------+
|1467810369|Mon apr 06 22:19:...|null|_theSpecialOne|@switchfoot http:...|
|1467810672|Mon Apr 06 22:19:...|null| scotthamilton|is upset that he ...|
|1467811184|Mon Apr 06 22:19:...|null|      mattycus|@Kenichan I dived...|
+----------+--------------------+----+--------------+--------------------+



In [28]:
#+---+------+------+----+------+
#| id|weight|height| age|gender|
# fill all null values with 0
df_na.fillna('0').show()

+----------+--------------------+----+--------------+--------------------+
|       Ids|                Date|Flag|          User|                Text|
+----------+--------------------+----+--------------+--------------------+
|1467810369|Mon apr 06 22:19:...|   0|_theSpecialOne|@switchfoot http:...|
|1467810672|Mon Apr 06 22:19:...|   0| scotthamilton|is upset that he ...|
|1467811184|Mon Apr 06 22:19:...|   0|      mattycus|@Kenichan I dived...|
+----------+--------------------+----+--------------+--------------------+



In [29]:
# fill null values with specific value
df_na.fillna( { 'Ids':'1467811372', 'User':'elleCTF' } ).show()

+----------+--------------------+----+--------------+--------------------+
|       Ids|                Date|Flag|          User|                Text|
+----------+--------------------+----+--------------+--------------------+
|1467810369|Mon apr 06 22:19:...|null|_theSpecialOne|@switchfoot http:...|
|1467810672|Mon Apr 06 22:19:...|null| scotthamilton|is upset that he ...|
|1467811184|Mon Apr 06 22:19:...|null|      mattycus|@Kenichan I dived...|
+----------+--------------------+----+--------------+--------------------+



In [30]:
# Return new df omitting rows with null values
df_na.na.drop().show()

+---+----+----+----+----+
|Ids|Date|Flag|User|Text|
+---+----+----+----+----+
+---+----+----+----+----+



In [31]:
df_na.na.drop(subset='Flag').show()

+---+----+----+----+----+
|Ids|Date|Flag|User|Text|
+---+----+----+----+----+
+---+----+----+----+----+



In [32]:
df_na.replace("0","No Query").show()

+----------+--------------------+----+--------------+--------------------+
|       Ids|                Date|Flag|          User|                Text|
+----------+--------------------+----+--------------+--------------------+
|1467810369|Mon apr 06 22:19:...|null|_theSpecialOne|@switchfoot http:...|
|1467810672|Mon Apr 06 22:19:...|null| scotthamilton|is upset that he ...|
|1467811184|Mon Apr 06 22:19:...|null|      mattycus|@Kenichan I dived...|
+----------+--------------------+----+--------------+--------------------+



In [33]:
#deleting column 
df.drop('Flag').show()

+----------+--------------------+--------------+--------------------+
|       Ids|                Date|          User|                Text|
+----------+--------------------+--------------+--------------------+
|1467810369|Mon apr 06 22:19:...|_theSpecialOne|@switchfoot http:...|
|1467810672|Mon Apr 06 22:19:...| scotthamilton|is upset that he ...|
|1467811184|Mon Apr 06 22:19:...|      mattycus|@Kenichan I dived...|
+----------+--------------------+--------------+--------------------+



In [34]:
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
## Understand your data


In [35]:
# big headache and fever to read the dataset
#    (command)+(file://)+(path)+(filename)

#df = spark.read.csv('/home/hduser/Documents/CA2', header=True, inferSchema=True)

In [56]:
#folowiong you can see the two way to load the dataset, 
#I will use direct from the path, to have one of five V' (Velocity) 

path = "/user1/people.json"
#df = spark.read.csv("file:///home/hduser/Documents/CA2/ProjectTweets.csv", header=True, inferSchema = True)

In [37]:
#reading the data. i can't belive, it is working

In [38]:
tweets = sc.textFile("file:///home/hduser/Documents/CA2/ProjectTweets.csv")
header = tweets.first()

tweets = tweets \
    .filter(lambda row: row != header) \
    .map(lambda row: [int(elem) for elem in row.split(',')])

                                                                                

In [39]:
# creating the schema for my DataFrame
# was one  error because i didn' import pyspark.sql.types as typ
# but next did it, the command worked, great!

In [40]:
fields = [
    *[
        typ.StructField(h[1:-1], typ.IntegerType(), True)
        for h in header.split(',')
    ]
]
schema = typ.StructType(fields)

In [41]:
column_names  = ['Ids', 'Date', 'Flag', 'User', 'Text']    
full_df = pd.read_csv('file:///home/hduser/Documents/CA2/ProjectTweets.csv', header=None, names=column_names)

In [42]:
full_df.head()

Unnamed: 0,Ids,Date,Flag,User,Text
0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [43]:
print(full_df.shape)

(1600000, 5)


In [44]:
# Count the number of records
full_df.count()

Ids     1600000
Date    1600000
Flag    1600000
User    1600000
Text    1600000
dtype: int64

In [45]:
# Creating my Dataframe (i did it in line 27 )

In [46]:
# tweets_df = spark.createDataFrame(tweets, schema)

In [47]:
# tweets_df.printSchema()

In [48]:
#For categorical columns we will count the frequencies of their values using .groupby(...) method.

In [49]:
#tweets_df.groupby('46781036457').count().show()  ***not working ***

In [50]:
# well, lest go to make  the basic descriptive statistics for our dataset

In [51]:
#For the truly numerical features we can use the .describe() method.

In [52]:
#in line 11 import SparkSession from PySpark and definned Spark =SparkSession.....
#in line 13 defined the schema INT and String
#in line 30 Lod Dataset. CSV.. df= spark.read.csv
# inline 27 Display the structure of Schema

In [53]:
#Data Procesising

In [55]:
df = df.withColumn("Text", lower(df["text"]))
df = df.withColumn("Text", regexp_replace(df["text"], "[^a-z0-9\\s]", ""))

AnalysisException: Cannot resolve column name "text" among (0, 1467810369, Mon Apr 06 22:19:45 PDT 2009, NO_QUERY, _TheSpecialOne_, @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D)

In [34]:
rows_df.shape

(3, 5)