In [1]:
from pyspark.sql import SparkSession

# Start (or reuse) the notebook's SparkSession. The "spark.jars" setting
# names the PostgreSQL JDBC driver jar used by the JDBC reads in this notebook.
spark = (
    SparkSession.builder
    .appName("Session-4")
    .config("spark.jars", "postgresql-driver.jar")
    .getOrCreate()
)

In [2]:
# Create a new dataframe from the `inventory` table, read over JDBC with the postgres driver.
# Connection settings are taken from environment variables so that the password is
# not stored in the notebook; when DVDRENTAL_PASSWORD is unset it is prompted for.
import os
from getpass import getpass

jdbc_url = os.environ.get("DVDRENTAL_JDBC_URL", "jdbc:postgresql://3.85.128.234:8081/dvdrental")
jdbc_user = os.environ.get("DVDRENTAL_USER", "myself")
jdbc_password = os.environ.get("DVDRENTAL_PASSWORD") or getpass("PostgreSQL password: ")

inv_df = (
    spark.read
    .format("jdbc")
    .option("url", jdbc_url)
    .option("dbtable", "inventory")
    .option("user", jdbc_user)
    .option("password", jdbc_password)
    .option("driver", "org.postgresql.Driver")
    .load()
)

# Show the column names and types Spark derived from the table.
inv_df.printSchema()

root
 |-- inventory_id: integer (nullable = true)
 |-- film_id: short (nullable = true)
 |-- store_id: short (nullable = true)
 |-- last_update: timestamp (nullable = true)



In [14]:
# Preview the first five rows of the inventory dataframe.
inv_df.show(5)

+------------+-------+--------+-------------------+
|inventory_id|film_id|store_id|        last_update|
+------------+-------+--------+-------------------+
|           1|      1|       1|2006-02-15 10:09:17|
|           2|      1|       1|2006-02-15 10:09:17|
|           3|      1|       1|2006-02-15 10:09:17|
|           4|      1|       1|2006-02-15 10:09:17|
|           5|      1|       2|2006-02-15 10:09:17|
+------------+-------+--------+-------------------+
only showing top 5 rows



In [17]:
# Load the `film` table over JDBC.
# Connection settings are taken from environment variables so that the password is
# not stored in the notebook; when DVDRENTAL_PASSWORD is unset it is prompted for.
import os
from getpass import getpass

jdbc_url = os.environ.get("DVDRENTAL_JDBC_URL", "jdbc:postgresql://3.85.128.234:8081/dvdrental")
jdbc_user = os.environ.get("DVDRENTAL_USER", "myself")
jdbc_password = os.environ.get("DVDRENTAL_PASSWORD") or getpass("PostgreSQL password: ")

film_df = (
    spark.read
    .format("jdbc")
    .option("url", jdbc_url)
    .option("dbtable", "film")
    .option("user", jdbc_user)
    .option("password", jdbc_password)
    .option("driver", "org.postgresql.Driver")
    .load()
)

# Preview the first five films.
film_df.show(5)

+-------+-----------------+--------------------+------------+-----------+---------------+-----------+------+----------------+------+--------------------+--------------------+--------------------+
|film_id|            title|         description|release_year|language_id|rental_duration|rental_rate|length|replacement_cost|rating|         last_update|    special_features|            fulltext|
+-------+-----------------+--------------------+------------+-----------+---------------+-----------+------+----------------+------+--------------------+--------------------+--------------------+
|    133|  Chamber Italian|A Fateful Reflect...|        2006|          1|              7|       4.99|   117|           14.99| NC-17|2013-05-26 14:50:...|          [Trailers]|'chamber':1 'fate...|
|    384| Grosse Wonderful|A Epic Drama of a...|        2006|          1|              5|       4.99|    49|           19.99|     R|2013-05-26 14:50:...| [Behind the Scenes]|'australia':18 'c...|
|      8|  Airport P

In [41]:
# Sample people records used to demonstrate writing and reading parquet files.
# Each tuple follows the column order listed in `parquet_columns`.
parquet_columns = ["firstname", "middlename", "lastname", "dob", "gender", "salary"]

parquet_data = [
    ("James ", "", "Smith", "36636", "M", 3000),
    ("Michael ", "Rose", "", "40288", "M", 4000),
    ("Robert ", "", "Williams", "42114", "M", 4000),
    ("Maria ", "Anne", "Jones", "39192", "F", 4000),
    ("Jen", "Mary", "Brown", "", "F", -1),
]

parquet_df = spark.createDataFrame(data=parquet_data, schema=parquet_columns)

In [42]:
# Write the content of the dataframe to a parquet dataset.
# mode("overwrite") replaces any "people.parquet" left behind by a previous run;
# the default save mode raises an error when the target path already exists,
# which made this cell fail whenever the notebook was re-run.
parquet_df.write.mode("overwrite").parquet("people.parquet")

In [43]:
# Load the parquet dataset written above back into a dataframe and display it.
people_df = spark.read.format("parquet").load("people.parquet")
people_df.show()

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|  dob|gender|salary|
+---------+----------+--------+-----+------+------+
|  Robert |          |Williams|42114|     M|  4000|
|   Maria |      Anne|   Jones|39192|     F|  4000|
|      Jen|      Mary|   Brown|     |     F|    -1|
|   James |          |   Smith|36636|     M|  3000|
| Michael |      Rose|        |40288|     M|  4000|
+---------+----------+--------+-----+------+------+



In [44]:
# Append the male rows to the same parquet dataset, then reload and display it.
# Note: every re-run of this cell appends those rows again.

male_people_df = people_df.filter('gender == "M"')
male_people_df.write.parquet("people.parquet", mode="append")
m_df = spark.read.parquet("people.parquet")
m_df.show()


+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|  dob|gender|salary|
+---------+----------+--------+-----+------+------+
|  Robert |          |Williams|42114|     M|  4000|
|   Maria |      Anne|   Jones|39192|     F|  4000|
|      Jen|      Mary|   Brown|     |     F|    -1|
|  Robert |          |Williams|42114|     M|  4000|
|   James |          |   Smith|36636|     M|  3000|
| Michael |      Rose|        |40288|     M|  4000|
|   James |          |   Smith|36636|     M|  3000|
| Michael |      Rose|        |40288|     M|  4000|
+---------+----------+--------+-----+------+------+



In [50]:
# Register the dataframe as a temporary view so it can be queried with SQL.

people_df.createOrReplaceTempView("ParquetTable")

# NOTE: despite its name, `sql_context` holds the DataFrame returned by spark.sql().
high_salary_query = "select * from ParquetTable where salary >= 4000 "
sql_context = spark.sql(high_salary_query)
sql_context.show()

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|  dob|gender|salary|
+---------+----------+--------+-----+------+------+
|  Robert |          |Williams|42114|     M|  4000|
|   Maria |      Anne|   Jones|39192|     F|  4000|
| Michael |      Rose|        |40288|     M|  4000|
+---------+----------+--------+-----+------+------+



In [51]:
# A temp view can also be defined directly on top of the parquet files.
# OR REPLACE keeps the cell re-runnable: a plain CREATE TEMPORARY VIEW fails
# once a view named PERSON already exists in the session.

spark.sql("CREATE OR REPLACE TEMPORARY VIEW PERSON USING parquet OPTIONS (path \"people.parquet\")")
spark.sql("SELECT * FROM PERSON").show()

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|  dob|gender|salary|
+---------+----------+--------+-----+------+------+
|  Robert |          |Williams|42114|     M|  4000|
|   Maria |      Anne|   Jones|39192|     F|  4000|
|      Jen|      Mary|   Brown|     |     F|    -1|
|  Robert |          |Williams|42114|     M|  4000|
|   James |          |   Smith|36636|     M|  3000|
| Michael |      Rose|        |40288|     M|  4000|
|   James |          |   Smith|36636|     M|  3000|
| Michael |      Rose|        |40288|     M|  4000|
+---------+----------+--------+-----+------+------+



In [53]:
# Write the data partitioned by gender and salary, replacing any previous
# output. Partitioning lets later reads target a single partition directory.
people_df.write.parquet(
    "people2.parquet",
    mode="overwrite",
    partitionBy=["gender", "salary"],
)


In [54]:
# Retrieving from a parquet partition: only the gender=M directory is read.
# Because the partition directory is read directly, the resulting dataframe
# has no `gender` column (see the output below).
male2_df = spark.read.parquet("people2.parquet/gender=M")
male2_df.show(truncate=False)

+---------+----------+--------+-----+------+
|firstname|middlename|lastname|dob  |salary|
+---------+----------+--------+-----+------+
|Robert   |          |Williams|42114|4000  |
|Michael  |Rose      |        |40288|4000  |
|James    |          |Smith   |36636|3000  |
+---------+----------+--------+-----+------+



In [55]:
# Creating a temp view on top of a single partition of the partitioned parquet data.
# OR REPLACE keeps the cell re-runnable: a plain CREATE TEMPORARY VIEW fails
# once a view named PERSON2 already exists in the session.

spark.sql("CREATE OR REPLACE TEMPORARY VIEW PERSON2 USING parquet OPTIONS (path \"people2.parquet/gender=F\")")
spark.sql("SELECT * FROM PERSON2").show()

+---------+----------+--------+-----+------+
|firstname|middlename|lastname|  dob|salary|
+---------+----------+--------+-----+------+
|   Maria |      Anne|   Jones|39192|  4000|
|      Jen|      Mary|   Brown|     |    -1|
+---------+----------+--------+-----+------+



In [57]:
import pandas as pd

# Rows of [name, age] used to build the example pandas DataFrame.
data = [['Scott', 50], ['Jeff', 45], ['Thomas', 54], ['Ann', 34]]

# Create the pandas DataFrame
pandas_df = pd.DataFrame(data, columns=['Name', 'Age'])

# Print the dataframe that was just built. The cell used to print `pandasDF`,
# a name that is only assigned by a later cell, so it raised NameError on a
# fresh kernel.
print(pandas_df)

     Name  Age
0   Scott   50
1    Jeff   45
2  Thomas   54
3     Ann   34


In [65]:
# Create PySpark DataFrame from Pandas.
# No schema is passed, so column names and types are taken from the pandas frame.
spark_pandas_df = spark.createDataFrame(pandas_df) 
spark_pandas_df.show()

+------+---+
|  Name|Age|
+------+---+
| Scott| 50|
|  Jeff| 45|
|Thomas| 54|
|   Ann| 34|
+------+---+



In [62]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# User-defined schema: a nullable string column "First Name" followed by a
# nullable integer column "Age".
my_schema = StructType([
    StructField("First Name", StringType(), True),
    StructField("Age", IntegerType(), True),
])


In [63]:
# Create a new dataframe, passing the custom schema as a parameter.
# The schema's field names replace the pandas column names (see the output below).
pandas_schema_df = spark.createDataFrame(pandas_df,schema=my_schema)
pandas_schema_df.show()

+----------+---+
|First Name|Age|
+----------+---+
|     Scott| 50|
|      Jeff| 45|
|    Thomas| 54|
|       Ann| 34|
+----------+---+



In [64]:
# Schema of the dataframe that was built with the explicit `my_schema`.
pandas_schema_df.printSchema()

root
 |-- First Name: string (nullable = true)
 |-- Age: integer (nullable = true)



In [66]:
# Schema of the dataframe that was built without an explicit schema, for comparison.
spark_pandas_df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: long (nullable = true)



In [67]:
# Converting a pandas df using Apache Arrow.
# "spark.sql.execution.arrow.enabled" is the deprecated Spark 2.x name of this
# setting; Spark 3.x reads "spark.sql.execution.arrow.pyspark.enabled".
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
arrow_df = spark.createDataFrame(pandas_df)
arrow_df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: long (nullable = true)



In [68]:
# Now convert a Spark DF to a Pandas DF.
# Step one: build the Spark dataframe that will be converted.
columns = ["first_name", "middle_name", "last_name", "dob", "gender", "salary"]

data = [
    ("James", "", "Smith", "36636", "M", 60000),
    ("Michael", "Rose", "", "40288", "M", 70000),
    ("Robert", "", "Williams", "42114", "", 400000),
    ("Maria", "Anne", "Jones", "39192", "F", 500000),
    ("Jen", "Mary", "Brown", "", "F", 0),
]

pyspark_df = spark.createDataFrame(data, columns)
pyspark_df.printSchema()
pyspark_df.show(truncate=False)

root
 |-- first_name: string (nullable = true)
 |-- middle_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)

+----------+-----------+---------+-----+------+------+
|first_name|middle_name|last_name|dob  |gender|salary|
+----------+-----------+---------+-----+------+------+
|James     |           |Smith    |36636|M     |60000 |
|Michael   |Rose       |         |40288|M     |70000 |
|Robert    |           |Williams |42114|      |400000|
|Maria     |Anne       |Jones    |39192|F     |500000|
|Jen       |Mary       |Brown    |     |F     |0     |
+----------+-----------+---------+-----+------+------+



In [70]:
# Then use the toPandas method to turn the Spark dataframe into a pandas one.
pandasDF = pyspark_df.toPandas()
print(pandasDF)

  first_name middle_name last_name    dob gender  salary
0      James                 Smith  36636      M   60000
1    Michael        Rose            40288      M   70000
2     Robert              Williams  42114         400000
3      Maria        Anne     Jones  39192      F  500000
4        Jen        Mary     Brown             F       0


In [72]:
# Nested structure elements
from pyspark.sql.types import StructType, StructField, StringType,IntegerType
# Each record is ((first, middle, last), dob, gender, salary); the inner tuple
# is the nested "name" element and every value is a string.
data_struct = [
    (("James", "", "Smith"), "36636", "M", "3000"),
    (("Michael", "Rose", ""), "40288", "M", "4000"),
    (("Robert", "", "Williams"), "42114", "M", "4000"),
    (("Maria", "Anne", "Jones"), "39192", "F", "4000"),
    (("Jen", "Mary", "Brown"), "", "F", "-1"),
]

In [77]:
# Create the nested schema struct: `name` is itself a struct of three string
# fields, followed by the flat string columns dob, gender and salary.
name_struct = StructType([
    StructField('firstname', StringType(), True),
    StructField('middlename', StringType(), True),
    StructField('lastname', StringType(), True),
])
schema_struct = StructType([
    StructField('name', name_struct),
    StructField('dob', StringType(), True),
    StructField('gender', StringType(), True),
    StructField('salary', StringType(), True),
])
nested_df = spark.createDataFrame(data_struct, schema_struct)
nested_df.show()

+--------------------+-----+------+------+
|                name|  dob|gender|salary|
+--------------------+-----+------+------+
|    {James, , Smith}|36636|     M|  3000|
|   {Michael, Rose, }|40288|     M|  4000|
|{Robert, , Williams}|42114|     M|  4000|
|{Maria, Anne, Jones}|39192|     F|  4000|
|  {Jen, Mary, Brown}|     |     F|    -1|
+--------------------+-----+------+------+



In [76]:
# Convert the nested Spark dataframe to a Pandas DF.
# The struct column `name` shows up as one dict-like value per row (see output below).
pandas_df2 = nested_df.toPandas()
print(pandas_df2)

                                                name    dob gender salary
0  {'firstname': 'James', 'middlename': '', 'last...  36636      M   3000
1  {'firstname': 'Michael', 'middlename': 'Rose',...  40288      M   4000
2  {'firstname': 'Robert', 'middlename': '', 'las...  42114      M   4000
3  {'firstname': 'Maria', 'middlename': 'Anne', '...  39192      F   4000
4  {'firstname': 'Jen', 'middlename': 'Mary', 'la...             F     -1


In [1]:
# Install dependencies
# !pip install pyarrow

# Pyarrow + Plasma

# The -m flag sets the size of the store in bytes, and the -s flag sets the socket the store listens on
#!plasma_store -m 1000000000 -s /tmp/plasma

/home/conda/feedstock_root/build_artifacts/arrow-cpp-ext_1644752432717/work/cpp/src/plasma/store.cc:1274: Allowing the Plasma store to use up to 1GB of memory.
/home/conda/feedstock_root/build_artifacts/arrow-cpp-ext_1644752432717/work/cpp/src/plasma/store.cc:1297: Starting object store with directory /dev/shm and huge page support disabled
^C


In [2]:
# Create a plasma client connected to the store's socket
# (the same /tmp/plasma path given to plasma_store via -s).
import pyarrow.plasma as plasma
client = plasma.connect("/tmp/plasma")
client

<pyarrow._plasma.PlasmaClient at 0x7fb440259c30>

In [3]:
# Each object in the Plasma store should be associated with a unique ID
# Here the ID is built from 20 bytes (the letter "a" repeated 20 times).
# NOTE(review): the name `id` shadows Python's builtin id() for the rest of the session.
id = plasma.ObjectID(20 * b"a")
id

ObjectID(6161616161616161616161616161616161616161)

In [4]:
import numpy as np

def random_object_id():
    """Return a Plasma ObjectID built from 20 random bytes drawn via numpy."""
    random_bytes = np.random.bytes(20)
    return plasma.ObjectID(random_bytes)

# Generate one random ID to show what it looks like.
random_object_id()

ObjectID(dfd7d049d70fcbac27043d867e31a076672071e6)

In [5]:
# Create a python object.
# put() stores the value and returns the ID under which it was stored.
object_id = client.put("hello, world")

# Get the object.
client.get(object_id)

'hello, world'

In [6]:
# Several objects can be stored and then fetched back in a single call.
# Store the integers 1, 2 and 3, keeping the ID returned for each.
object_id1, object_id2, object_id3 = [client.put(value) for value in (1, 2, 3)]

# Passing a list of IDs to get() returns the stored values in the same order.
client.get([object_id1, object_id2, object_id3])

[1, 2, 3]

In [7]:
# Reserve an `object_size`-byte buffer in the store under a fixed 20-byte ID.
object_size = 1000
object_id = plasma.ObjectID(20 * b"a")
buffer = memoryview(client.create(object_id, object_size))

# Fill every byte of the buffer with its offset modulo 128.
for offset in range(object_size):
    buffer[offset] = offset % 128

In [8]:
# Seal the object. This makes the object immutable and available to other clients.
client.seal(object_id)

In [10]:
# Create a different client. Note that this second client could be
# created in the same or in a separate, concurrent Python session.
client2 = plasma.connect("/tmp/plasma")

# Get the object in the second client. This blocks until the object has been sealed.
# The lookup uses the ID rebuilt here (the same 20 bytes as the sealed object)
# rather than whatever `object_id` currently holds in the kernel; previously
# `object_id2` was created but never used.
object_id2 = plasma.ObjectID(20 * b"a")
[buffer2] = client2.get_buffers([object_id2])

# Display the first client's memoryview; `buffer2` is inspected in the next cell.
buffer

<memory at 0x7fb42bafa1c0>

In [11]:
# Display the buffer object returned to the second client.
buffer2

<pyarrow.lib.Buffer at 0x7fb42bb02ab0>

In [12]:
# Byte at offset 1 as seen through the first client's memoryview.
buffer[1]

1

In [31]:
# Offset 129 was written as 129 % 128, so it holds the same value as offset 1.
buffer[129]

1

In [15]:
# Wrap the second client's buffer in a memoryview so it can be indexed like `buffer`.
view2 = memoryview(buffer2)

In [16]:
# Byte at offset 1 as seen by the second client.
view2[1]

1

In [30]:
# Byte at offset 129 as seen by the second client.
view2[129]

1

In [33]:
# Bytes at offsets 1..9 as seen through the first client's memoryview.
bytes(buffer[1:10])

b'\x01\x02\x03\x04\x05\x06\x07\x08\t'

In [32]:
# The same offsets read through the second client's view, for comparison.
bytes(view2[1:10])

b'\x01\x02\x03\x04\x05\x06\x07\x08\t'

In [34]:
import pyarrow.plasma as plasma
import time

# Open another client connection to the same store socket.
client3 = plasma.connect("/tmp/plasma")

In [38]:
# Store an object; the ID returned by put() is not kept here.
client.put("hello, world")
# Sleep a little so we get different creation times
time.sleep(2)
client.put("another object")
# Create an object that is not sealed yet
object_id = plasma.ObjectID.from_random()
client.create(object_id, 100)
# List the objects currently in the store together with their metadata.
client.list()


{ObjectID(d17e87d4d22f043d71ac2864893686da45d9f913): {'data_size': 576,
  'metadata_size': 0,
  'ref_count': 0,
  'create_time': 1645812740,
  'construct_duration': 0,
  'state': 'sealed'},
 ObjectID(a945d7e5349a81cef1d56209bed58989272d9675): {'data_size': 100,
  'metadata_size': 0,
  'ref_count': 1,
  'create_time': 1645812729,
  'construct_duration': -1,
  'state': 'created'},
 ObjectID(23c3747c644e3540c1440936e6516e8789bd1961): {'data_size': 576,
  'metadata_size': 0,
  'ref_count': 0,
  'create_time': 1645812729,
  'construct_duration': 0,
  'state': 'sealed'},
 ObjectID(f56ab559bdc9c2abe047ff6ed6aba41a54a4b49f): {'data_size': 576,
  'metadata_size': 0,
  'ref_count': 0,
  'create_time': 1645812698,
  'construct_duration': 0,
  'state': 'sealed'},
 ObjectID(a7133fac3713a8ee9b68fe327499f014fbf5878b): {'data_size': 576,
  'metadata_size': 0,
  'ref_count': 0,
  'create_time': 1645812882,
  'construct_duration': 0,
  'state': 'sealed'},
 ObjectID(eb89c66279561479965fae7013bc1c2ff597a4

In [49]:
import numpy as np
import pyarrow as pa

# Create a pyarrow.Tensor object from a numpy random 2-dimensional array
# The array has shape (10, 4); no random seed is set, so values differ per run.
data = np.random.randn(10, 4)
# Show one row (index 4) of the array as a sample.
data[4]

array([ 0.2635016 , -0.39320425,  0.74859679, -1.31778029])

In [46]:
# Wrap the numpy array in an Arrow tensor.
tensor = pa.Tensor.from_numpy(data)
tensor

<pyarrow.Tensor>
type: double
shape: (10, 4)
strides: (32, 8)

In [50]:
# Create the object in Plasma
# A random 20-byte ID is generated, and the buffer is sized to what
# get_tensor_size reports for the tensor.
object_id = plasma.ObjectID(np.random.bytes(20))
data_size = pa.ipc.get_tensor_size(tensor)
buf = client.create(object_id, data_size)

In [51]:
# Write the tensor into the Plasma-allocated buffer
stream = pa.FixedSizeBufferWriter(buf)
pa.ipc.write_tensor(tensor, stream)  # Returns the number of bytes written (576 in the recorded run)

576

In [52]:
# Seal the Plasma object so that it can be read back with get_buffers.
client.seal(object_id)

In [58]:
# Get the arrow object by ObjectID.
# get_buffers takes a list of IDs and returns one buffer per ID.
[buf2] = client.get_buffers([object_id])
buf2

<pyarrow.lib.Buffer at 0x7fb42bb0a5f0>

In [57]:
# Reconstruct the Arrow tensor object.
# The buffer is wrapped in a reader and the tensor is read back from it.
reader = pa.BufferReader(buf2)
tensor2 = pa.ipc.read_tensor(reader)
tensor2

<pyarrow.Tensor>
type: double
shape: (10, 4)
strides: (32, 8)

In [56]:
# Convert back to numpy
# The displayed array has the tensor's shape, (10, 4).
array = tensor2.to_numpy()
array

array([[-1.55267097,  0.45403492,  0.61712591, -0.15753051],
       [-2.44991476,  0.97311571, -0.24487969, -0.89297473],
       [ 1.2597042 ,  1.42905181,  0.3302586 ,  0.02979372],
       [-0.01713524, -1.29416298,  0.92401369, -0.29975209],
       [ 0.61527264,  1.444473  ,  0.03533133, -0.77023468],
       [ 1.05767962,  1.40729435,  1.67217589,  0.25303491],
       [ 0.81223315,  0.24885553,  0.60027239, -0.5533307 ],
       [ 1.33607181, -0.05753372,  1.33274268, -0.91863381],
       [-0.1925225 , -1.49456125, -0.12463497, -0.32851444],
       [ 0.34957346, -0.57476307,  1.59733538,  0.64178924]])

In [61]:
# Storing pandas in plasma
import pandas as pd

# Build a small DataFrame from two Series with different indexes; the missing
# ('d', 'one') entry becomes NaN.
series_one = pd.Series([1.0, 2.0, 3.0], index=list("abc"))
series_two = pd.Series([1.0, 2.0, 3.0, 4.0], index=list("abcd"))
d = {"one": series_one, "two": series_two}
df = pd.DataFrame(d)
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [62]:
# Convert the Pandas DataFrame into a PyArrow RecordBatch
# The frame's index is carried along as the extra column __index_level_0__ (see output).
record_batch = pa.RecordBatch.from_pandas(df)
record_batch

pyarrow.RecordBatch
one: double
two: double
__index_level_0__: string

In [64]:
# Create the Plasma object from the PyArrow RecordBatch. Most of the work here
# is done to determine the size of buffer to request from the object store.
object_id = plasma.ObjectID(np.random.bytes(20))
# Write the batch to a mock stream first; only its resulting size is used.
mock_sink = pa.MockOutputStream()
with pa.RecordBatchStreamWriter(mock_sink, record_batch.schema) as stream_writer:
    stream_writer.write_batch(record_batch)
data_size = mock_sink.size()
# Request a buffer of exactly that size from the store.
buf = client.create(object_id, data_size)
buf

<pyarrow._plasma.PlasmaBuffer at 0x7fb3ef444fa0>

In [65]:
# Write the PyArrow RecordBatch to Plasma
# This is the same stream write as in the sizing step, now targeting the Plasma buffer.
stream = pa.FixedSizeBufferWriter(buf)
with pa.RecordBatchStreamWriter(stream, record_batch.schema) as stream_writer:
    stream_writer.write_batch(record_batch)

In [66]:
# Seal the Plasma object so that it can be read back with get_buffers.
client.seal(object_id)

In [69]:
# Fetch the Plasma object
# NOTE: this rebinds `data` and `buffer`, names used earlier in the notebook
# for other objects.
[data] = client.get_buffers([object_id])  # Get PlasmaBuffer from ObjectID
buffer = pa.BufferReader(data)
buffer

<pyarrow.lib.BufferReader at 0x7fb3ef39c2c0>

In [70]:
# Convert object back into an Arrow RecordBatch
# Only the first batch of the stream is read; `record_batch` is rebound to it.
reader = pa.RecordBatchStreamReader(buffer)
record_batch = reader.read_next_batch()
record_batch

pyarrow.RecordBatch
one: double
two: double
__index_level_0__: string

In [120]:
# Convert back into Pandas
# The displayed frame matches the DataFrame that was stored.
result = record_batch.to_pandas()
result

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [129]:
# Load the dataset in chunks: with chunksize=100, read_csv returns an iterable
# reader (see output) instead of a DataFrame. The Age column is parsed as int32.
data = pd.read_csv('train_dataset.csv', chunksize=100, dtype={'Age': 'int32'})
data

<pandas.io.parsers.readers.TextFileReader at 0x7fb3deabec70>

In [131]:
# Collect every chunk first and concatenate once: calling pd.concat inside the
# loop re-copies all previously accumulated rows on each iteration.
chunks = []
for x in data:
    chunks.append(x)
    print(x.shape)

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# (which is what the original loop produced) when the reader yields no chunks.
full_data = pd.concat(chunks) if chunks else pd.DataFrame()

(100, 8)
(100, 8)
(100, 8)
(100, 8)
(100, 8)
(100, 8)
(9, 8)


In [133]:
# Display the concatenated frame (the notebook renders a truncated preview).
full_data

Unnamed: 0,Gender,Age,openness,neuroticism,conscientiousness,agreeableness,extraversion,Personality (Class label)
100,Male,18,5,3,3,6,2,responsible
101,Female,19,6,5,2,3,3,lively
102,Male,27,7,5,6,7,3,dependable
103,Male,17,6,5,4,7,5,dependable
104,Female,21,4,6,3,5,2,lively
...,...,...,...,...,...,...,...,...
704,Female,20,4,5,6,6,1,responsible
705,Male,18,6,3,1,5,5,dependable
706,Male,22,5,2,3,6,1,serious
707,Male,19,5,6,5,7,5,extraverted


In [106]:
# iterator=True returns a reader object instead of a DataFrame;
# get_chunk(100) then reads the next 100 rows from it.
reader = pd.read_csv('train_dataset.csv', iterator=True)
data2 = reader.get_chunk(100)
data2

Unnamed: 0,Gender,Age,openness,neuroticism,conscientiousness,agreeableness,extraversion,Personality (Class label)
0,Male,17,7,4,7,3,2,extraverted
1,Male,19,4,5,4,6,6,serious
2,Female,18,7,6,4,5,5,dependable
3,Female,22,5,6,7,4,3,extraverted
4,Female,19,7,4,6,5,4,lively
...,...,...,...,...,...,...,...,...
95,Male,18,3,6,6,1,3,responsible
96,Female,18,4,4,5,7,3,dependable
97,Female,19,5,5,4,6,3,lively
98,Male,19,2,5,3,6,4,serious


In [134]:
def generate_chunk(reader, chunk_size):
    """Yield successive chunks of up to ``chunk_size`` rows from ``reader``.

    Args:
        reader: An object with a ``get_chunk(n)`` method, such as the reader
            returned by ``pd.read_csv(..., iterator=True)``. A falsy reader
            (e.g. ``None``) yields nothing.
        chunk_size: Maximum number of rows requested per chunk.

    Yields:
        Whatever ``reader.get_chunk(chunk_size)`` returns, until the reader
        is exhausted.
    """
    if not reader:
        return
    while True:
        try:
            chunk = reader.get_chunk(chunk_size)
        except StopIteration:
            # get_chunk signals end-of-data with StopIteration. Letting that
            # escape from a generator body is converted into RuntimeError
            # (PEP 479), so finish the generator cleanly instead.
            return
        yield chunk

In [137]:
# Wrap the reader in the chunk generator, requesting 100 rows per chunk.
dataset = generate_chunk(reader, 100)
dataset

<generator object generate_chunk at 0x7fb3de833890>

In [139]:
# Fetch the next chunk from the generator and display it.
chunk2 = next(dataset)
chunk2

Unnamed: 0,Gender,Age,openness,neuroticism,conscientiousness,agreeableness,extraversion,Personality (Class label)
100,Male,18,5,3,3,6,2,responsible
101,Female,19,6,5,2,3,3,lively
102,Male,27,7,5,6,7,3,dependable
103,Male,17,6,5,4,7,5,dependable
104,Female,21,4,6,3,5,2,lively
...,...,...,...,...,...,...,...,...
195,Male,19,4,5,5,7,5,extraverted
196,Female,18,5,3,1,5,5,extraverted
197,Female,18,2,7,1,4,4,responsible
198,Male,20,6,6,1,2,2,lively


In [141]:
def infinite_loop():
    """Generate the integers 0, 1, 2, ... one at a time, without ever finishing."""
    counter = -1
    while True:
        counter += 1
        yield counter

In [145]:
# Pull the first three values out of a fresh generator, printing one per line.
inf = infinite_loop()
print(next(inf), next(inf), next(inf), sep="\n")

0
1
2


In [146]:
# Keep drawing values from the generator while they are below 10.
# Note that the first value that fails the test is still consumed from `inf`.
while next(inf) < 10:
    print('Less than 10')

Less than 10
Less than 10
Less than 10
Less than 10
Less than 10
Less than 10
Less than 10


In [147]:
# A list comprehension builds all of its squares immediately ...
nums_squared_lc = [n * n for n in range(5)]
# ... while a generator expression produces its squares lazily, on demand.
nums_squared_gc = (k * k for k in range(10000))

In [148]:
import sys
# print(sys.getsizeof(nums_squared_lc))
# print(sys.getsizeof(nums_squared_gc))