In [None]:
# Mount Google Drive so files stored there are reachable under /content/drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Instalamos las dependencias de nuestro entorno de trabajo 
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://apache.osuosl.org/spark/spark-3.1.3/spark-3.1.3-bin-hadoop2.7.tgz
!tar xf spark-3.1.3-bin-hadoop2.7.tgz
!pip install -q findspark
!pip install koalas

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting koalas
  Downloading koalas-1.8.2-py3-none-any.whl (390 kB)
[K     |████████████████████████████████| 390 kB 28.8 MB/s 
Installing collected packages: koalas
Successfully installed koalas-1.8.2


In [None]:
# List the working directory to confirm the Spark archive was downloaded and extracted.
!ls -a

.   .config  sample_data		spark-3.1.3-bin-hadoop2.7.tgz
..  drive    spark-3.1.3-bin-hadoop2.7


In [None]:
import os

# Point JAVA_HOME and SPARK_HOME at the Java 8 and Spark 3.1.3 installs.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.1.3-bin-hadoop2.7",
})

In [None]:
import findspark
# Keep this call ahead of the pyspark import below; it presumably uses the
# SPARK_HOME set earlier to make pyspark importable — TODO confirm.
findspark.init()

from pyspark.sql import SparkSession

# Get (or create) the Spark session used by the rest of the notebook, on a local master.
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib as plt
%matplotlib inline
import databricks.koalas as ks
from pathlib import Path



In [None]:
def importar(path:Path, format:str): 
  '''
  Load a file with Spark and return it as a Koalas DataFrame.

  Arguments:
    :: path: 'Path' or 'str' location of the file to read
    :: format: 'str' data source format handed to Spark (e.g. "json")

  Returns: 
  ---------
  Koalas DataFrame with the file contents; its shape is printed as a side effect
  '''

  # The signature advertises Path objects, but Spark's reader expects a string
  # (or a list of strings), so convert before handing the path over.
  if isinstance(path, Path):
    path = str(path)

  df = spark.read.load(path, format=format).to_koalas()
  print(df.shape)

  return df


In [None]:
# Import all the datasets and print their shape
# Folder that holds every Yelp JSON file; change it here if the data moves.
DATASET_DIR = "/content/drive/MyDrive/HENRY_TRABAJO_GRUPAL/Dataset_Yelp"

# Load each dataset as a Koalas DataFrame (`importar` prints its shape).
review = importar(f"{DATASET_DIR}/review.json", "json")
user = importar(f"{DATASET_DIR}/user.json", "json")
business = importar(f"{DATASET_DIR}/business.json", "json")
# The variable keeps its original spelling `chechin` because later cells refer to it.
chechin = importar(f"{DATASET_DIR}/checkin.json", "json")
tip = importar(f"{DATASET_DIR}/tip.json", "json")

(6990280, 9)
(1987897, 22)
(150346, 14)
(131930, 2)
(908915, 5)


In [None]:
# Lookup from dataset name to its loaded DataFrame.
dataset_dict = dict(
    review=review,
    user=user,
    business=business,
    chechin=chechin,
    tip=tip,
)

In [None]:
# List the column names of the tip dataset.
tip.columns

Index(['business_id', 'compliment_count', 'date', 'text', 'user_id'], dtype='object')

In [None]:
# Column dtypes and non-null counts for the tip dataset.
tip.info()

<class 'databricks.koalas.frame.DataFrame'>
Int64Index: 908915 entries, 0 to 908914
Data columns (total 5 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   business_id       908915 non-null  object
 1   compliment_count  908915 non-null  int64 
 2   date              908915 non-null  object
 3   text              908915 non-null  object
 4   user_id           908915 non-null  object
dtypes: int64(1), object(4)

In [None]:
# Column dtypes and non-null counts for the review dataset.
review.info()

<class 'databricks.koalas.frame.DataFrame'>
Int64Index: 6990280 entries, 0 to 6990279
Data columns (total 9 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   business_id  6990280 non-null  object 
 1   cool         6990280 non-null  int64  
 2   date         6990280 non-null  object 
 3   funny        6990280 non-null  int64  
 4   review_id    6990280 non-null  object 
 5   stars        6990280 non-null  float64
 6   text         6990280 non-null  object 
 7   useful       6990280 non-null  int64  
 8   user_id      6990280 non-null  object 
dtypes: float64(1), int64(3), object(5)

## EDA de Review

In [None]:
def unique_values(data): 
  '''
  Report how many distinct values each column of a DataFrame holds.

  Prints the per-column distinct counts, draws them as a bar chart and, for
  every column with fewer than 20 distinct values, prints those values.

  Arguments:
    :: data: 'DataFrame' to inspect

  Return: 
  ---------
  The object returned by plotting the distinct counts as a bar chart
  '''

  nunique_per_column = data.nunique()

  print('----Unique Values in DataFrame----','\n',nunique_per_column)
  unique_values_viz = nunique_per_column.plot(kind = 'bar', title='Number of Unique Values in DataFrame')

  # Reuse the counts computed above instead of running one more distinct-count
  # per column just to test the threshold.
  counts_by_column = nunique_per_column.to_dict()

  for column in data.columns: 
    if counts_by_column[column] < 20:
      # Named `distinct` so the function's own name is not shadowed.
      distinct = data[column].unique()
      print('\n''----Unique Values in column----','\n',distinct)

  return unique_values_viz

In [None]:
# Distinct-value report for the tip dataset.
unique_values(tip)

----Unique Values in DataFrame---- 
 business_id         106193
compliment_count         7
date                906694
text                851033
user_id             301758
dtype: int64

----Unique Values in column---- 
 0    0
1    6
2    5
3    1
4    3
5    2
6    4
Name: compliment_count, dtype: int64


 - shape
 - info()
 - describe()
 - isnull().sum()
 - duplicated().sum()
 - value_counts()

 - Outliers --> Visualizar outliers

**Transformaciones**





### EDA tip

**Descripción de datos de columna:**

// string, texto del tip
"text": "Secret menu - fried chicken sando is da bombbbbbb Their zapatos are good too.",

// string, fecha y hora en que se escribió el tip, YYYY-MM-DD HH:MM:SS
"date": "2012-05-18 02:17:21",

// entero, cuántos cumplidos totales tiene
"compliment_count": 172,

// string, 22 caracteres, id del negocio que se refiere al negocio en business.json
"business_id": "tnhfDv5Il8EaGSXZGiuQGg",

// string, 22 caracteres de id de usuario, que se refieren al usuario en user.json
"user_id": "49JhAJh8vSQ-vM4Aourl0g"

In [None]:
# Preview the first rows of the tip dataset.
tip.head()

Unnamed: 0,business_id,compliment_count,date,text,user_id
0,3uLgwr0qeCNMjKenHJwPGQ,0,2012-05-18 02:17:21,Avengers time with the ladies.,AGNUgVwnZUey3gcPCJ76iw
1,QoezRbYQncpRqyrLH6Iqjg,0,2013-02-05 18:35:10,They have lots of good deserts and tasty cuban...,NBN4MgHP9D3cw--SnauTkA
2,MYoRNLb5chwjQe3c_k37Gg,0,2013-08-18 00:56:08,It's open even when you think it isn't,-copOvldyKh1qr-vzkDEvw
3,hV-bABTK-glh5wj31ps_Jw,0,2017-06-27 23:05:38,Very decent fried chicken,FjMQVZjSqY8syIO-53KFKw
4,_uN0OudeJ3Zl_tf6nxg5ww,0,2012-10-06 19:43:09,Appetizers.. platter special for lunch,ld0AperBXk1h6UbqmM80zw


In [None]:
# (rows, columns) of the tip dataset.
tip.shape

(908915, 5)

In [None]:
# Column dtypes and non-null counts for the tip dataset.
tip.info()

<class 'databricks.koalas.frame.DataFrame'>
Int64Index: 908915 entries, 0 to 908914
Data columns (total 5 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   business_id       908915 non-null  object
 1   compliment_count  908915 non-null  int64 
 2   date              908915 non-null  object
 3   text              908915 non-null  object
 4   user_id           908915 non-null  object
dtypes: int64(1), object(4)

In [None]:
# Character length of every tip, computed with the built-in string accessor
# instead of a per-row Python `len` call.
tip['length_text'] = tip['text'].str.len()


In Python 3.6+ and Spark 3.0+, it is preferred to specify type hints for pandas UDF instead of specifying pandas UDF type which will be deprecated in the future releases. See SPARK-28264 for more details.



In [None]:
# Length of the longest tip text, in characters.
tip['length_text'].max()

500

In [None]:
# Histogram of tip text lengths, using 10 bins.
tip['length_text'].hist(bins=10, title='Longitud de texto')

In [None]:
# Summary statistics for the numeric columns of the tip dataset.
tip.describe()

Unnamed: 0,compliment_count,length_text
count,908915.0,908915.0
mean,0.012525,62.580226
std,0.120763,57.828705
min,0.0,1.0
25%,0.0,28.0
50%,0.0,49.0
75%,0.0,76.0
max,6.0,500.0


In [None]:
# Parse the `date` strings into datetimes, overwriting the column in place.
tip['date'] = ks.to_datetime(tip['date'])


In Python 3.6+ and Spark 3.0+, it is preferred to specify type hints for pandas UDF instead of specifying pandas UDF type which will be deprecated in the future releases. See SPARK-28264 for more details.



In [None]:
# Re-check dtypes after the date conversion and the new length_text column.
tip.info()

<class 'databricks.koalas.frame.DataFrame'>
Int64Index: 908915 entries, 0 to 908914
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype     
---  ------            --------------   -----     
 0   business_id       908915 non-null  object    
 1   compliment_count  908915 non-null  int64     
 2   date              908915 non-null  datetime64
 3   text              908915 non-null  object    
 4   user_id           908915 non-null  object    
 5   length_text       908915 non-null  int64     
dtypes: datetime64(1), int64(2), object(3)

In [None]:
# Bar chart of the number of distinct values in each tip column.
tip.nunique().plot(kind = 'bar', title='Cantidad de Valores únicos')

In [None]:
# A DataFrame has no `unique` method (the original call raised AttributeError);
# `nunique` gives the per-column distinct counts this chart is titled for.
tip.nunique().plot(kind = 'bar', title='Cantidad de Valores únicos')

AttributeError: ignored

In [None]:
# Number of missing values in each tip column.
tip.isnull().sum()

In [None]:
# How many tips fall on each compliment_count value.
tip['compliment_count'].value_counts()

In [None]:
# Bar chart of the compliment_count frequencies.
tip['compliment_count'].value_counts().plot(kind = 'bar', title='Cantidad de valores únicos en compliment_count')

In [None]:
# Earliest and latest tip timestamps. Named so the built-in `min`/`max`
# functions stay usable in later cells.
date_min = tip['date'].min()
date_max = tip['date'].max()

print(f'Datos desde: {date_min}, hasta: {date_max}')

In [None]:
# Store the calendar-date part of each tip timestamp in a new `dates` column.
tip['dates'] = ks.to_datetime(tip['date']).dt.date

In [None]:
# Store the time of day of each tip, formatted as HH:MM:SS, in a new `time` column.
tip['time'] = ks.to_datetime(tip['date']).dt.strftime('%H:%M:%S')

In [None]:
def is_empty(value) -> int:
  '''
  Flag text that is missing or contains only whitespace.

  Arguments:
    :: value: 'str' text to check (None is treated as empty)

  Returns:
  ---------
  1 when the text is None, empty or whitespace-only, otherwise 0
  '''
  # A missing value would otherwise fail on `.strip()`; count it as empty.
  if value is None:
    return 1
  return 1 if value.strip() == '' else 0

In [None]:
# Flag each tip whose text is empty or whitespace-only (1) versus not (0).
tip['empty_text'] = tip.text.apply(is_empty)

In [None]:
# Count how many tips carry each empty_text flag value.
tip.empty_text.value_counts()

## EDA de User, Business, Review y Checkin

In [None]:
# Preview the first rows of the user dataset.
user.head()

In [None]:
# `show()` is a Spark DataFrame method, not a Koalas one, so convert first.
business.to_spark().show()

In [None]:
# `review` is a Koalas DataFrame, which has no Spark-style `select`, and the
# Spark column functions the original used were never imported. Count the
# missing values per column with the Koalas API instead; the result keeps the
# original one-record list shape.
null_counts = [review.isnull().sum().to_pandas().to_dict()]
print(f"We have {int(sum(null_counts[0].values())):d} null values in this dataset.")

In [None]:
# Number of missing values in each review column.
review.isna().sum()

In [None]:
# Preview the first rows of the business dataset (the name was misspelt as `bussiness`).
business.head()

In [None]:
# Column dtypes and non-null counts for the check-in dataset.
chechin.info()

In [None]:
# Number of missing values in each check-in column.
chechin.isna().sum()

In [None]:
# Preview the first rows of the check-in dataset.
chechin.head()

In [None]:
# Number of missing values in each review column.
review.isna().sum()

In [None]:
# Column dtypes and non-null counts for the review dataset.
review.info()

In [None]:
# Number of missing values in each tip column.
tip.isna().sum()

In [None]:
# Preview the first rows of the tip dataset.
tip.head()

In [None]:
# Number of missing values in each user column.
user.isna().sum()

In [None]:
def mysql_insert_demo():
  '''
  Read USER_T.csv and append its rows to the MySQL table USER_T.

  Python port of the Scala `MysqlInsertDemo` object that was pasted into this
  cell; Scala source is a syntax error in a Python notebook. Like the Scala
  `main`, nothing runs until the function is called.

  The database credentials are read from the environment instead of being
  written in the notebook: MYSQL_USER (defaults to "root") and MYSQL_PASSWORD
  (required).

  Raises:
  ---------
  KeyError when MYSQL_PASSWORD is not set
  '''
  import os
  from pyspark.sql import SparkSession

  session = SparkSession.builder.appName("MysqlInsertDemo").master("local").getOrCreate()
  df = session.read.option("header", "true").csv("src/main/resources/scala/USER_T.csv")
  df.show()
  url = "jdbc:mysql://192.168.44.128:3306/hive?useUnicode=true&characterEncoding=utf-8"
  prop = {
      "user": os.environ.get("MYSQL_USER", "root"),
      "password": os.environ["MYSQL_PASSWORD"],
  }
  # NOTE(review): presumably needs the MySQL JDBC driver on Spark's classpath — confirm.
  df.write.mode("append").jdbc(url, "USER_T", properties=prop)

In [None]:
# Sample opening hours keyed by weekday name, as "open-close" strings.
dicc = dict(
    Monday="10:00-21:00",
    Tuesday="10:00-21:00",
)





In [None]:
# Turn each known weekday entry into [day_number, [opening, closing]].
# Only the two weekdays handled so far are numbered; other keys are skipped.
DAY_NUMBERS = {'Monday': 1, 'Tuesday': 2}

Hours = []
for key in dicc.keys():
  if key in DAY_NUMBERS:
    Hours.append([DAY_NUMBERS[key], dicc[key].split('-')])
print(Hours)