In [1]:
from hdfs import InsecureClient
import pandas as pd
import numpy as np
import re
import seaborn as sns

#### Koneksi ke Hadoop

In [2]:
# Create the HDFS client used by the read/write cells below; it talks to
# http://127.0.0.1:9870 and acts as the user "hduser".
client_hdfs = InsecureClient(
    'http://127.0.0.1:9870',
    user="hduser",
)

#### Collecting data dari data source

In [3]:
# Load the raw hourly weather records directly from the CSV hosted on GitHub.
df = pd.read_csv(
    'https://raw.githubusercontent.com/Ibrahimsyah/PBD_FinalProject/master/weatherHistory.csv'
)

In [4]:
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96453 entries, 0 to 96452
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Formatted Date            96453 non-null  object 
 1   Summary                   96453 non-null  object 
 2   Precip Type               95936 non-null  object 
 3   Temperature (C)           96453 non-null  float64
 4   Apparent Temperature (C)  96453 non-null  float64
 5   Humidity                  96453 non-null  float64
 6   Wind Speed (km/h)         96453 non-null  float64
 7   Wind Bearing (degrees)    96453 non-null  float64
 8   Visibility (km)           96453 non-null  float64
 9   Loud Cover                96453 non-null  float64
 10  Pressure (millibars)      96453 non-null  float64
 11  Daily Summary             96453 non-null  object 
dtypes: float64(8), object(4)
memory usage: 8.8+ MB
None


In [5]:
# Upload the frame to HDFS as CSV, replacing any file already at that path.
with client_hdfs.write(
    '/finalProject/input/weather.csv',
    encoding = 'utf-8',
    overwrite=True,
) as hdfs_out:
    df.to_csv(hdfs_out)

#### Mengambil data dari Hadoop

In [6]:
# Read the CSV back from HDFS; column 0 is used as the row index.
with client_hdfs.read(
    '/finalProject/input/weather.csv',
    encoding = 'utf-8',
) as hdfs_in:
    df = pd.read_csv(hdfs_in, index_col=0)

#### Preprocessing Data

In [7]:
# Remove spaces and parenthesised parts from column names.

def mapper(x):
    """Return the column name `x` without spaces and without "(...)" groups.

    Spaces are stripped first; afterwards every parenthesised group that
    contains no nested parentheses is removed together with its parentheses,
    e.g. "Wind Speed (km/h)" -> "WindSpeed".
    """
    without_spaces = ''.join(x.split(' '))
    return re.sub(r'\([^()]*\)', '', without_spaces)

# Run every column label through `mapper`, install the cleaned labels on the
# frame, and preview the first five rows.
newColumns = df.columns.map(mapper)
df.columns = newColumns
df.head()

Unnamed: 0,FormattedDate,Summary,PrecipType,Temperature,ApparentTemperature,Humidity,WindSpeed,WindBearing,Visibility,LoudCover,Pressure,DailySummary
0,2006-04-01 00:00:00.000 +0200,Partly Cloudy,rain,9.472222,7.388889,0.89,14.1197,251.0,15.8263,0.0,1015.13,Partly cloudy throughout the day.
1,2006-04-01 01:00:00.000 +0200,Partly Cloudy,rain,9.355556,7.227778,0.86,14.2646,259.0,15.8263,0.0,1015.63,Partly cloudy throughout the day.
2,2006-04-01 02:00:00.000 +0200,Mostly Cloudy,rain,9.377778,9.377778,0.89,3.9284,204.0,14.9569,0.0,1015.94,Partly cloudy throughout the day.
3,2006-04-01 03:00:00.000 +0200,Partly Cloudy,rain,8.288889,5.944444,0.83,14.1036,269.0,15.8263,0.0,1016.41,Partly cloudy throughout the day.
4,2006-04-01 04:00:00.000 +0200,Mostly Cloudy,rain,8.755556,6.977778,0.83,11.0446,259.0,15.8263,0.0,1016.51,Partly cloudy throughout the day.


In [8]:
# Print the frame summary again to check the renamed columns and their dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 96453 entries, 0 to 96452
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   FormattedDate        96453 non-null  object 
 1   Summary              96453 non-null  object 
 2   PrecipType           95936 non-null  object 
 3   Temperature          96453 non-null  float64
 4   ApparentTemperature  96453 non-null  float64
 5   Humidity             96453 non-null  float64
 6   WindSpeed            96453 non-null  float64
 7   WindBearing          96453 non-null  float64
 8   Visibility           96453 non-null  float64
 9   LoudCover            96453 non-null  float64
 10  Pressure             96453 non-null  float64
 11  DailySummary         96453 non-null  object 
dtypes: float64(8), object(4)
memory usage: 9.6+ MB


#### Generate DataFrame baru berisi data rata-rata harian Temperature, Humidity, WindSpeed, dan Pressure

In [10]:
# Work on a copy so the hourly frame `df` keeps its full timestamp strings.
dateDf = df.copy()
# Keep only the text before the first space of each timestamp, i.e. the
# calendar date ("2006-04-01 00:00:00.000 +0200" -> "2006-04-01").
dateDf['FormattedDate'] = dateDf['FormattedDate'].str.split(' ').str[0]

# Sorted unique dates present in the data.
dates = np.unique(dateDf.FormattedDate)

# Average the four measurements per date in a single grouped pass instead of
# re-filtering the whole frame once for every unique date. Groups come out in
# sorted key order, matching the order of `dates`.
# NOTE: groupby().mean() skips NaN whereas np.mean() would propagate it; the
# four averaged columns have no missing values here (df.info() reports every
# row as non-null for each of them), so the results are the same.
dailyMeans = (
    dateDf
    .groupby('FormattedDate', sort=True)[['Temperature', 'Humidity', 'WindSpeed', 'Pressure']]
    .mean()
    .rename(columns={
        'Temperature': 'temperatureMean',
        'Humidity': 'humidityMean',
        'WindSpeed': 'windSpeedMean',
        'Pressure': 'pressureMean',
    })
    .rename_axis('date')
    .reset_index()
)

# One record per date, with the keys date, temperatureMean, humidityMean,
# windSpeedMean and pressureMean, consumed by the next cell.
means = dailyMeans.to_dict('records')

In [11]:
# Build the daily-means frame from the list of per-date records.
meanDf = pd.DataFrame(means)
# DataFrame.info() prints its own report and returns None, so it is called
# directly; print(meanDf.info()) added a stray "None" line to the output.
meanDf.info()
# Preview the first five daily rows.
meanDf.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4018 entries, 0 to 4017
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   date             4018 non-null   object 
 1   temperatureMean  4018 non-null   float64
 2   humidityMean     4018 non-null   float64
 3   windSpeedMean    4018 non-null   float64
 4   pressureMean     4018 non-null   float64
dtypes: float64(4), object(1)
memory usage: 157.1+ KB
None


Unnamed: 0,date,temperatureMean,humidityMean,windSpeedMean,pressureMean
0,2006-01-01,3.873148,0.818333,21.37275,1012.279167
1,2006-01-02,5.418519,0.844583,17.551683,1010.131667
2,2006-01-03,2.319444,0.898333,8.417617,1020.805
3,2006-01-04,2.274074,0.905417,11.579925,981.826667
4,2006-01-05,2.698148,0.948333,9.5151,935.988333


In [12]:
# Convert the `date` column from strings (object dtype) to datetime, then
# print the frame summary to confirm the new dtype.
meanDf['date'] = meanDf['date'].pipe(pd.to_datetime)
meanDf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4018 entries, 0 to 4017
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   date             4018 non-null   datetime64[ns]
 1   temperatureMean  4018 non-null   float64       
 2   humidityMean     4018 non-null   float64       
 3   windSpeedMean    4018 non-null   float64       
 4   pressureMean     4018 non-null   float64       
dtypes: datetime64[ns](1), float64(4)
memory usage: 157.1 KB


#### Menyimpan data rata-rata ke dalam Hadoop dengan path /finalProject/output/result.csv

In [13]:
# Store the daily-means frame on HDFS as CSV, replacing any previous result file.
with client_hdfs.write(
    '/finalProject/output/result.csv',
    encoding = 'utf-8',
    overwrite=True,
) as hdfs_out:
    meanDf.to_csv(hdfs_out)