In [1]:
# https://github.com/johanna23cct/integrated-CA2-MSc-2023094.git

In [2]:
#git remote add origin https://github.com/johanna23cct/integrated-CA2-MSc-2023094.git
#git branch -M main
#git push -u origin main

In [3]:
pip install skforecast --user

Note: you may need to restart the kernel to use updated packages.


In [4]:
# Data manipulation
# ==============================================================================
import numpy as np
import pandas as pd

# Plots
# ==============================================================================
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
plt.rcParams['lines.linewidth'] = 1.5
%matplotlib inline

# Modeling and Forecasting
# ==============================================================================
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

from skforecast.ForecasterAutoreg import ForecasterAutoreg
from skforecast.ForecasterAutoregCustom import ForecasterAutoregCustom
#from skforecast.ForecasterAutoregMultiOutput import ForecasterAutoregMultiOutput
from skforecast.model_selection import grid_search_forecaster
from skforecast.model_selection import backtesting_forecaster

from joblib import dump, load

# Spark
from pyspark.sql import SparkSession


# Warnings configuration
# ==============================================================================
import warnings
# warnings.filterwarnings('ignore')

In [5]:
# SparkContext HOW I CAN SHOW THAT ... SC  IS NOT WORKING TO ME!

In [6]:
# import SparkSession library 
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType

spark = SparkSession.builder.appName('data_projectTweets').getOrCreate()

In [7]:
# creating a customer dataframe by declaring the schema and passing values
import pyspark.sql.functions as F
from pyspark.sql.types import *

# create custom Dataframe by declaring the schema

schema = StructType().add("Ids","integer").add("Date","string").add("Flag", "string").add("User",'string').add("Text", "string")

In [8]:
# insert the values
df = spark.createDataFrame([(1467810369, "Mon apr 06 22:19:45 PDT 2009", "NoQuery", "_theSpecialOne", "@switchfoot http://twitpic.com/2y1zl - Awww, t..."), (1467810672, "Mon Apr 06 22:19:49 PDT 2009", "NO_QUERY", "scotthamilton", "is upset that he can not update his Facebook b..." ), (1467811184, "Mon Apr 06 22:19:57 PDT 2009", "NO_QUERY", "mattycus","@Kenichan I dived many times for the ball. Man...")],schema=schema)

In [9]:
# Display the structure of Schema
df.printSchema()

root
 |-- Ids: integer (nullable = true)
 |-- Date: string (nullable = true)
 |-- Flag: string (nullable = true)
 |-- User: string (nullable = true)
 |-- Text: string (nullable = true)



In [10]:
# Show the data
df.show()

                                                                                

+----------+--------------------+--------+--------------+--------------------+
|       Ids|                Date|    Flag|          User|                Text|
+----------+--------------------+--------+--------------+--------------------+
|1467810369|Mon apr 06 22:19:...| NoQuery|_theSpecialOne|@switchfoot http:...|
|1467810672|Mon Apr 06 22:19:...|NO_QUERY| scotthamilton|is upset that he ...|
|1467811184|Mon Apr 06 22:19:...|NO_QUERY|      mattycus|@Kenichan I dived...|
+----------+--------------------+--------+--------------+--------------------+



In [11]:
# Create a new dataframe with nukll values 
df_na = spark.createDataFrame([(1467810369, "Mon apr 06 22:19:45 PDT 2009", None, "_theSpecialOne", "@switchfoot http://twitpic.com/2y1zl - Awww, t..."), (1467810672, "Mon Apr 06 22:19:49 PDT 2009", None, "scotthamilton", "is upset that he can not update his Facebook b..." ), (1467811184, "Mon Apr 06 22:19:57 PDT 2009", None, "mattycus","@Kenichan I dived many times for the ball. Man...")],schema=schema)

In [12]:
df_na.show()

+----------+--------------------+----+--------------+--------------------+
|       Ids|                Date|Flag|          User|                Text|
+----------+--------------------+----+--------------+--------------------+
|1467810369|Mon apr 06 22:19:...|null|_theSpecialOne|@switchfoot http:...|
|1467810672|Mon Apr 06 22:19:...|null| scotthamilton|is upset that he ...|
|1467811184|Mon Apr 06 22:19:...|null|      mattycus|@Kenichan I dived...|
+----------+--------------------+----+--------------+--------------------+



In [13]:
# fill all null values with 0
df_na.fillna('0').show()

+----------+--------------------+----+--------------+--------------------+
|       Ids|                Date|Flag|          User|                Text|
+----------+--------------------+----+--------------+--------------------+
|1467810369|Mon apr 06 22:19:...|   0|_theSpecialOne|@switchfoot http:...|
|1467810672|Mon Apr 06 22:19:...|   0| scotthamilton|is upset that he ...|
|1467811184|Mon Apr 06 22:19:...|   0|      mattycus|@Kenichan I dived...|
+----------+--------------------+----+--------------+--------------------+



In [14]:
# fill null values with specific value
df_na.fillna( { 'Ids':'1467811372', 'User':'elleCTF' } ).show()

+----------+--------------------+----+--------------+--------------------+
|       Ids|                Date|Flag|          User|                Text|
+----------+--------------------+----+--------------+--------------------+
|1467810369|Mon apr 06 22:19:...|null|_theSpecialOne|@switchfoot http:...|
|1467810672|Mon Apr 06 22:19:...|null| scotthamilton|is upset that he ...|
|1467811184|Mon Apr 06 22:19:...|null|      mattycus|@Kenichan I dived...|
+----------+--------------------+----+--------------+--------------------+



In [15]:
# Return new df omitting rows with null values
df_na.na.drop().show()

+---+----+----+----+----+
|Ids|Date|Flag|User|Text|
+---+----+----+----+----+
+---+----+----+----+----+



In [16]:
df_na.na.drop(subset='Flag').show()

+---+----+----+----+----+
|Ids|Date|Flag|User|Text|
+---+----+----+----+----+
+---+----+----+----+----+



In [17]:
df_na.replace("0","No Query").show()

+----------+--------------------+----+--------------+--------------------+
|       Ids|                Date|Flag|          User|                Text|
+----------+--------------------+----+--------------+--------------------+
|1467810369|Mon apr 06 22:19:...|null|_theSpecialOne|@switchfoot http:...|
|1467810672|Mon Apr 06 22:19:...|null| scotthamilton|is upset that he ...|
|1467811184|Mon Apr 06 22:19:...|null|      mattycus|@Kenichan I dived...|
+----------+--------------------+----+--------------+--------------------+



In [18]:
#deleting column 
df.drop('Flag').show()

+----------+--------------------+--------------+--------------------+
|       Ids|                Date|          User|                Text|
+----------+--------------------+--------------+--------------------+
|1467810369|Mon apr 06 22:19:...|_theSpecialOne|@switchfoot http:...|
|1467810672|Mon Apr 06 22:19:...| scotthamilton|is upset that he ...|
|1467811184|Mon Apr 06 22:19:...|      mattycus|@Kenichan I dived...|
+----------+--------------------+--------------+--------------------+



In [1]:
# big headache and fever
#    (command)+(file://)+(path)+(filename)

#df = spark.read.csv('/home/hduser/Documents/CA2', header=True, inferSchema=True)

In [19]:
#loading the dataset into a Dataframe


df = spark.read.csv("file:///home/hduser/Documents/CA2/ProjectTweets.csv", header=True, inferSchema = True)

                                                                                

In [26]:
column_names  = ['Ids', 'Date', 'Flag', 'User', 'Text']    
full_df = pd.read_csv('ProjectTweets.csv', header=None, names=column_names)

In [27]:
print(full_df.shape)

(1600000, 5)


In [28]:
# Count the number of records
full_df.count()

Ids     1600000
Date    1600000
Flag    1600000
User    1600000
Text    1600000
dtype: int64

In [29]:
full_df.head()

Unnamed: 0,Ids,Date,Flag,User,Text
0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [31]:
rows_df = pd.DataFrame(data, columns=['Ids', 'Date', 'Flag', 'User', 'Text'])

NameError: name 'data' is not defined

In [None]:
print(rows_df)

In [33]:
data = {
    'Ids': [1467810369, 1467810672, 1467811184],
    'Date': ['Mon Apr 06 22:19:45 PDT 2009', 'Mon Apr 06 22:19:49 PDT 2009', 'Mon Apr 06 22:19:57 PDT 2009'],
    'Flag': ['NO_QUERY', 'NO_QUERY', 'NO_QUERY'],
    'User': ['_TheSpecialOne_', 'scotthamilton', 'mattycus'],
    'Text': ['@switchfoot http://twitpic.com/2y1zl - Awww, that is a bummer. You shoulda got David Carr of Third Day to do it. ;D', 'is upset that he can not update his Facebook by ...', '@Kenichan I dived many times for the ball. Man...']
}

rows_df = pd.DataFrame(data, columns=['Ids', 'Data', 'Flag', 'User', 'Text'])

# Transpose the DataFrame (swap rows  and columns)
transposed_rows_df = rows_df.transpose()

# use the 'T' attribute to transpose the DataFrame
# transposed_df = df.T

print("Original DataFrame:")
print(rows_df)
print("\nTransposed DataFrame:")
print(transposed_rows_df)

Original DataFrame:
          Ids Data      Flag             User  \
0  1467810369  NaN  NO_QUERY  _TheSpecialOne_   
1  1467810672  NaN  NO_QUERY    scotthamilton   
2  1467811184  NaN  NO_QUERY         mattycus   

                                                Text  
0  @switchfoot http://twitpic.com/2y1zl - Awww, t...  
1  is upset that he can not update his Facebook b...  
2  @Kenichan I dived many times for the ball. Man...  

Transposed DataFrame:
                                                      0  \
Ids                                          1467810369   
Data                                                NaN   
Flag                                           NO_QUERY   
User                                    _TheSpecialOne_   
Text  @switchfoot http://twitpic.com/2y1zl - Awww, t...   

                                                      1  \
Ids                                          1467810672   
Data                                                NaN   
Flag    

In [34]:
rows_df.shape

(3, 5)