### Examples Of Pyspark ML

#### INICIAMOS SESION

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('dataframe').getOrCreate()

In [2]:
spark

In [43]:
sc = spark.sparkContext

# using SQLContext to read parquet file
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

# to read parquet file
products = sqlContext.read.parquet('ejercicio/data_technical_assessment/products.parquet')



In [48]:
dff=products.toPandas()

In [50]:
dff

Unnamed: 0,ProductRootCode,ProductId,Family,SupplierPrice,RetailPrice
0,1,2,BRACELET,36.020000,180.0
1,3,4,BRACELET,41.430000,220.0
2,5,6,BRACELET,53.580002,230.0
3,7,8,BRACELET,62.349998,250.0
4,9,10,BRACELET,0.640000,59.0
...,...,...,...,...,...
9342,16589,16593,RING,4.310000,45.0
9343,16594,16595,NECKLACE,3.820000,35.0
9344,16596,16597,EARRINGS,3.300000,29.0
9345,16598,16599,BRACELET,5.670000,39.0


CARGAMOS CSV

In [3]:
training = spark.read.csv('data/test1.csv', header=True, inferSchema=True)

In [4]:
training.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



En PySpark se trabaja de forma diferente. Tendremos que agrupar nuestras variables independientes de forma que queden todas en una columna y dentro de una lista, por lo que crearemos un vector de ensamblaje o "vector assembler", de tal modo que queden así esas variables independientes:
- [Age, Experience]

Lo que haremos con estas dos, será tratarlas como una nueva variable independiente:
- [Age, Experience] ----> nueva_variable_independiente

In [5]:
from pyspark.ml.feature import VectorAssembler 

In [6]:
feature_assembler = VectorAssembler(inputCols=['age', 'Experience'], outputCol='Independent features') # 1:46:27 del vídeo

In [8]:
output = feature_assembler.transform(training)


Veremos que se crea una nueva columna cuyos valores se corresponden a unos array con el contenido de aquellas variables independientes que hemos agrupado. Esto será nuestro input feature o lo que solíamos definir como train.

In [9]:
output.show()

+---------+---+----------+------+--------------------+
|     Name|age|Experience|Salary|Independent features|
+---------+---+----------+------+--------------------+
|    Krish| 31|        10| 30000|         [31.0,10.0]|
|Sudhanshu| 30|         8| 25000|          [30.0,8.0]|
|    Sunny| 29|         4| 20000|          [29.0,4.0]|
|     Paul| 24|         3| 20000|          [24.0,3.0]|
|   Harsha| 21|         1| 15000|          [21.0,1.0]|
|  Shubham| 23|         2| 18000|          [23.0,2.0]|
+---------+---+----------+------+--------------------+



Seleccionamos las columnas que nos interesan para nuestro modelo: el train (Independent Features) y el test (Salary)

In [10]:
finalized_data = output.select('Independent features', 'Salary')
finalized_data.show()

+--------------------+------+
|Independent features|Salary|
+--------------------+------+
|         [31.0,10.0]| 30000|
|          [30.0,8.0]| 25000|
|          [29.0,4.0]| 20000|
|          [24.0,3.0]| 20000|
|          [21.0,1.0]| 15000|
|          [23.0,2.0]| 18000|
+--------------------+------+



A continuación, entrenaremos un modelo de regresión lineal.

In [42]:
import pyspark.ml.regression as psc

In [11]:
from pyspark.ml.regression import LinearRegression

In [29]:
train, test = finalized_data.randomSplit([0.75, 0.25], seed=42)
regressor = LinearRegression(featuresCol='Independent features', labelCol='Salary')
regressor = regressor.fit(train)

In [30]:
train.show()

+--------------------+------+
|Independent features|Salary|
+--------------------+------+
|          [21.0,1.0]| 15000|
|          [23.0,2.0]| 18000|
|          [29.0,4.0]| 20000|
|          [30.0,8.0]| 25000|
|         [31.0,10.0]| 30000|
+--------------------+------+



In [31]:
test.show()

+--------------------+------+
|Independent features|Salary|
+--------------------+------+
|          [24.0,3.0]| 20000|
+--------------------+------+



In [32]:
regressor.coefficients

DenseVector([-64.8464, 1584.7554])

In [33]:
regressor.intercept

15414.10693970376

In [34]:
test.show()

+--------------------+------+
|Independent features|Salary|
+--------------------+------+
|          [24.0,3.0]| 20000|
+--------------------+------+



In [35]:
prediction = regressor.evaluate(test)

In [36]:
prediction.predictions.show()

+--------------------+------+------------------+
|Independent features|Salary|        prediction|
+--------------------+------+------------------+
|          [24.0,3.0]| 20000|18612.059158134223|
+--------------------+------+------------------+



In [37]:
# Errores

prediction.meanAbsoluteError, prediction.meanSquaredError

(1387.9408418657767, 1926379.780519081)

In [20]:
4200+500+7200


11900

In [22]:
12000/3

4000.0

In [51]:
!pip install tweepy

Collecting tweepy
  Downloading tweepy-4.10.0-py3-none-any.whl (94 kB)
     -------------------------------------- 94.4/94.4 kB 772.5 kB/s eta 0:00:00
Collecting requests<3,>=2.27.0
  Using cached requests-2.28.1-py3-none-any.whl (62 kB)
Collecting requests-oauthlib<2,>=1.2.0
  Using cached requests_oauthlib-1.3.1-py2.py3-none-any.whl (23 kB)
Collecting oauthlib<4,>=3.2.0
  Using cached oauthlib-3.2.0-py3-none-any.whl (151 kB)
Collecting urllib3<1.27,>=1.21.1
  Using cached urllib3-1.26.10-py2.py3-none-any.whl (139 kB)
Collecting idna<4,>=2.5
  Using cached idna-3.3-py3-none-any.whl (61 kB)
Collecting charset-normalizer<3,>=2
  Using cached charset_normalizer-2.1.0-py3-none-any.whl (39 kB)
Installing collected packages: urllib3, oauthlib, idna, charset-normalizer, requests, requests-oauthlib, tweepy
Successfully installed charset-normalizer-2.1.0 idna-3.3 oauthlib-3.2.0 requests-2.28.1 requests-oauthlib-1.3.1 tweepy-4.10.0 urllib3-1.26.10


In [None]:
# 'consumer_key':'k4Cb7sjrYKb9biOMWOBJTJukP', #add your consumer key
#         'consumer_secret':'Qb0mya6kC9OUtIdqyDFWz5gCw4WGFF4jtJ8LkzTsUQmGdZVbe0',

In [85]:
import tweepy
consumer_token = "vBsf316MsywHcg0SxAm2exqEr"
consumer_secret = "KHJxQtSV0cyqM1VFZclJ4GmtAFkejKbNQGYkpHEx8ilB5j43EC"

consumer_token = "k4Cb7sjrYKb9biOMWOBJTJukP" #isma
consumer_secret = "Qb0mya6kC9OUtIdqyDFWz5gCw4WGFF4jtJ8LkzTsUQmGdZVbe0" #isma

auth = tweepy.AppAuthHandler(consumer_token, consumer_secret)
auth.secure = True
api = tweepy.API(auth)

In [90]:
searchQuery = '@TheBridge_Tech'
retweet_filter='-filter:retweets'
sinceId = None

In [91]:
q=searchQuery+retweet_filter

In [92]:
tweetsPerQry = 1000
fName = 'tweets.txt'

In [76]:
!pip install jsonpickle

Collecting jsonpickle
  Downloading jsonpickle-2.2.0-py2.py3-none-any.whl (39 kB)
Installing collected packages: jsonpickle
Successfully installed jsonpickle-2.2.0


In [89]:
import jsonpickle

In [93]:
max_id = -1
maxTweets = 10000000 

tweetCount = 0
print("Downloading max {0} tweets".format(maxTweets))
with open(fName, 'w') as f:
    while tweetCount < maxTweets:
        
            if (max_id <= 0):
                if (not sinceId):
                    new_tweets = api.search_tweets(q=searchQuery, count=tweetsPerQry)
                else:
                    new_tweets = api.search_tweets(q=searchQuery, count=tweetsPerQry,
                                            since_id=sinceId)
            else:
                if (not sinceId):
                    new_tweets = api.search_tweets(q=searchQuery, count=tweetsPerQry,
                                            max_id=str(max_id - 1))
                else:
                    new_tweets = api.search_tweets(q=searchQuery, count=tweetsPerQry,
                                            max_id=str(max_id - 1),
                                            since_id=sinceId)
            if not new_tweets:
                print("No more tweets found")
                break
            for tweet in new_tweets:
                f.write(jsonpickle.encode(tweet._json, unpicklable=False) +
                        '\n')
            tweetCount += len(new_tweets)
            print("Downloaded {0} tweets".format(tweetCount))
            max_id = new_tweets[-1].id


print ("Downloaded {0} tweets, Saved to {1}".format(tweetCount, fName))

Downloading max 10000000 tweets
Downloaded 15 tweets
No more tweets found
Downloaded 15 tweets, Saved to tweets.txt


In [79]:
t = {"created_at": "Wed Jul 13 13:45:42 +0000 2022", "id": 1547215694651162625, "id_str": "1547215694651162625", "text": "RT @InesCalabuig2: Hoy hemos tenido una s\u00faper visita a nuestras oficinas @TheBridge_Tech\ud83d\udc49primera Aceleradora de Talento Digital. \nHemos pod\u2026", "truncated": false, "entities": {"hashtags": [], "symbols": [], "user_mentions": [{"screen_name": "InesCalabuig2", "name": "Ines Calabuig", "id": 1232276995796586499, "id_str": "1232276995796586499", "indices": [3, 17]}, {"screen_name": "TheBridge_Tech", "name": "The Bridge", "id": 1162694149956603904, "id_str": "1162694149956603904", "indices": [73, 88]}], "urls": []}, "metadata": {"iso_language_code": "es", "result_type": "recent"}, "source": "<a href=\"https://mobile.twitter.com\" rel=\"nofollow\">Twitter Web App</a>", "in_reply_to_status_id": null, "in_reply_to_status_id_str": null, "in_reply_to_user_id": null, "in_reply_to_user_id_str": null, "in_reply_to_screen_name": null, "user": {"id": 1162694149956603904, "id_str": "1162694149956603904", "name": "The Bridge", "screen_name": "TheBridge_Tech", "location": "Paseo de Recoletos, 15, Madrid", "description": "Aceleradora de Talento Digital\n\nBootcamps | Corporate Skilling | University Program Management | Startup Factory", "url": "https://t.co/N87RbsGawk", "entities": {"url": {"urls": [{"url": "https://t.co/N87RbsGawk", "expanded_url": "https://www.thebridge.tech/", "display_url": "thebridge.tech", "indices": [0, 23]}]}, "description": {"urls": []}}, "protected": false, "followers_count": 489, "friends_count": 119, "listed_count": 10, "created_at": "Sat Aug 17 11:54:55 +0000 2019", "favourites_count": 177, "utc_offset": null, "time_zone": null, "geo_enabled": true, "verified": false, "statuses_count": 488, "lang": null, "contributors_enabled": false, "is_translator": false, "is_translation_enabled": false, "profile_background_color": "F5F8FA", "profile_background_image_url": null, "profile_background_image_url_https": null, "profile_background_tile": false, "profile_image_url": "http://pbs.twimg.com/profile_images/1546415533909737474/UgLpusY5_normal.jpg", "profile_image_url_https": "https://pbs.twimg.com/profile_images/1546415533909737474/UgLpusY5_normal.jpg", "profile_banner_url": "https://pbs.twimg.com/profile_banners/1162694149956603904/1631868724", "profile_link_color": "1DA1F2", "profile_sidebar_border_color": "C0DEED", "profile_sidebar_fill_color": "DDEEF6", "profile_text_color": "333333", "profile_use_background_image": true, "has_extended_profile": false, "default_profile": true, "default_profile_image": false, "following": null, "follow_request_sent": null, "notifications": null, "translator_type": "none", "withheld_in_countries": []}, "geo": null, "coordinates": null, "place": null, "contributors": null, "retweeted_status": {"created_at": "Wed Jul 13 13:22:40 +0000 2022", "id": 1547209900874506241, "id_str": "1547209900874506241", "text": "Hoy hemos tenido una s\u00faper visita a nuestras oficinas @TheBridge_Tech\ud83d\udc49primera Aceleradora de Talento Digital. \nHemo\u2026 https://t.co/jwKSElvMn1", "truncated": true, "entities": {"hashtags": [], "symbols": [], "user_mentions": [{"screen_name": "TheBridge_Tech", "name": "The Bridge", "id": 1162694149956603904, "id_str": "1162694149956603904", "indices": [54, 69]}], "urls": [{"url": "https://t.co/jwKSElvMn1", "expanded_url": "https://twitter.com/i/web/status/1547209900874506241", "display_url": "twitter.com/i/web/status/1\u2026", "indices": [117, 140]}]}, "metadata": {"iso_language_code": "es", "result_type": "recent"}, "source": "<a href=\"https://mobile.twitter.com\" rel=\"nofollow\">Twitter Web App</a>", "in_reply_to_status_id": null, "in_reply_to_status_id_str": null, "in_reply_to_user_id": null, "in_reply_to_user_id_str": null, "in_reply_to_screen_name": null, "user": {"id": 1232276995796586499, "id_str": "1232276995796586499", "name": "Ines Calabuig", "screen_name": "InesCalabuig2", "location": "", "description": "ESG Head @GoHubVentures\nInvesting in #DeepTech startups to build more digital and efficient industries. #Tech4Good", "url": null, "entities": {"description": {"urls": []}}, "protected": false, "followers_count": 259, "friends_count": 548, "listed_count": 2, "created_at": "Tue Feb 25 12:11:53 +0000 2020", "favourites_count": 2526, "utc_offset": null, "time_zone": null, "geo_enabled": false, "verified": false, "statuses_count": 183, "lang": null, "contributors_enabled": false, "is_translator": false, "is_translation_enabled": false, "profile_background_color": "F5F8FA", "profile_background_image_url": null, "profile_background_image_url_https": null, "profile_background_tile": false, "profile_image_url": "http://pbs.twimg.com/profile_images/1373973790493970432/5TRNtvSO_normal.jpg", "profile_image_url_https": "https://pbs.twimg.com/profile_images/1373973790493970432/5TRNtvSO_normal.jpg", "profile_banner_url": "https://pbs.twimg.com/profile_banners/1232276995796586499/1616416722", "profile_link_color": "1DA1F2", "profile_sidebar_border_color": "C0DEED", "profile_sidebar_fill_color": "DDEEF6", "profile_text_color": "333333", "profile_use_background_image": true, "has_extended_profile": false, "default_profile": true, "default_profile_image": false, "following": null, "follow_request_sent": null, "notifications": null, "translator_type": "none", "withheld_in_countries": []}, "geo": null, "coordinates": null, "place": null, "contributors": null, "is_quote_status": false, "retweet_count": 1, "favorite_count": 3, "favorited": false, "retweeted": false, "possibly_sensitive": false, "lang": "es"}, "is_quote_status": false, "retweet_count": 1, "favorite_count": 0, "favorited": false, "retweeted": false, "lang": "es"}


NameError: name 'false' is not defined

In [81]:
a = """{"created_at": "Wed Jul 13 13:45:42 +0000 2022", "id": 1547215694651162625, "id_str": "1547215694651162625", "text": "RT @InesCalabuig2: Hoy hemos tenido una s\u00faper visita a nuestras oficinas @TheBridge_Tech\ud83d\udc49primera Aceleradora de Talento Digital. \nHemos pod\u2026", "truncated": false, "entities": {"hashtags": [], "symbols": [], "user_mentions": [{"screen_name": "InesCalabuig2", "name": "Ines Calabuig", "id": 1232276995796586499, "id_str": "1232276995796586499", "indices": [3, 17]}, {"screen_name": "TheBridge_Tech", "name": "The Bridge", "id": 1162694149956603904, "id_str": "1162694149956603904", "indices": [73, 88]}], "urls": []}, "metadata": {"iso_language_code": "es", "result_type": "recent"}, "source": "<a href=\"https://mobile.twitter.com\" rel=\"nofollow\">Twitter Web App</a>", "in_reply_to_status_id": null, "in_reply_to_status_id_str": null, "in_reply_to_user_id": null, "in_reply_to_user_id_str": null, "in_reply_to_screen_name": null, "user": {"id": 1162694149956603904, "id_str": "1162694149956603904", "name": "The Bridge", "screen_name": "TheBridge_Tech", "location": "Paseo de Recoletos, 15, Madrid", "description": "Aceleradora de Talento Digital\n\nBootcamps | Corporate Skilling | University Program Management | Startup Factory", "url": "https://t.co/N87RbsGawk", "entities": {"url": {"urls": [{"url": "https://t.co/N87RbsGawk", "expanded_url": "https://www.thebridge.tech/", "display_url": "thebridge.tech", "indices": [0, 23]}]}, "description": {"urls": []}}, "protected": false, "followers_count": 489, "friends_count": 119, "listed_count": 10, "created_at": "Sat Aug 17 11:54:55 +0000 2019", "favourites_count": 177, "utc_offset": null, "time_zone": null, "geo_enabled": true, "verified": false, "statuses_count": 488, "lang": null, "contributors_enabled": false, "is_translator": false, "is_translation_enabled": false, "profile_background_color": "F5F8FA", "profile_background_image_url": null, "profile_background_image_url_https": null, "profile_background_tile": false, "profile_image_url": "http://pbs.twimg.com/profile_images/1546415533909737474/UgLpusY5_normal.jpg", "profile_image_url_https": "https://pbs.twimg.com/profile_images/1546415533909737474/UgLpusY5_normal.jpg", "profile_banner_url": "https://pbs.twimg.com/profile_banners/1162694149956603904/1631868724", "profile_link_color": "1DA1F2", "profile_sidebar_border_color": "C0DEED", "profile_sidebar_fill_color": "DDEEF6", "profile_text_color": "333333", "profile_use_background_image": true, "has_extended_profile": false, "default_profile": true, "default_profile_image": false, "following": null, "follow_request_sent": null, "notifications": null, "translator_type": "none", "withheld_in_countries": []}, "geo": null, "coordinates": null, "place": null, "contributors": null, "retweeted_status": {"created_at": "Wed Jul 13 13:22:40 +0000 2022", "id": 1547209900874506241, "id_str": "1547209900874506241", "text": "Hoy hemos tenido una s\u00faper visita a nuestras oficinas @TheBridge_Tech\ud83d\udc49primera Aceleradora de Talento Digital. \nHemo\u2026 https://t.co/jwKSElvMn1", "truncated": true, "entities": {"hashtags": [], "symbols": [], "user_mentions": [{"screen_name": "TheBridge_Tech", "name": "The Bridge", "id": 1162694149956603904, "id_str": "1162694149956603904", "indices": [54, 69]}], "urls": [{"url": "https://t.co/jwKSElvMn1", "expanded_url": "https://twitter.com/i/web/status/1547209900874506241", "display_url": "twitter.com/i/web/status/1\u2026", "indices": [117, 140]}]}, "metadata": {"iso_language_code": "es", "result_type": "recent"}, "source": "<a href=\"https://mobile.twitter.com\" rel=\"nofollow\">Twitter Web App</a>", "in_reply_to_status_id": null, "in_reply_to_status_id_str": null, "in_reply_to_user_id": null, "in_reply_to_user_id_str": null, "in_reply_to_screen_name": null, "user": {"id": 1232276995796586499, "id_str": "1232276995796586499", "name": "Ines Calabuig", "screen_name": "InesCalabuig2", "location": "", "description": "ESG Head @GoHubVentures\nInvesting in #DeepTech startups to build more digital and efficient industries. #Tech4Good", "url": null, "entities": {"description": {"urls": []}}, "protected": false, "followers_count": 259, "friends_count": 548, "listed_count": 2, "created_at": "Tue Feb 25 12:11:53 +0000 2020", "favourites_count": 2526, "utc_offset": null, "time_zone": null, "geo_enabled": false, "verified": false, "statuses_count": 183, "lang": null, "contributors_enabled": false, "is_translator": false, "is_translation_enabled": false, "profile_background_color": "F5F8FA", "profile_background_image_url": null, "profile_background_image_url_https": null, "profile_background_tile": false, "profile_image_url": "http://pbs.twimg.com/profile_images/1373973790493970432/5TRNtvSO_normal.jpg", "profile_image_url_https": "https://pbs.twimg.com/profile_images/1373973790493970432/5TRNtvSO_normal.jpg", "profile_banner_url": "https://pbs.twimg.com/profile_banners/1232276995796586499/1616416722", "profile_link_color": "1DA1F2", "profile_sidebar_border_color": "C0DEED", "profile_sidebar_fill_color": "DDEEF6", "profile_text_color": "333333", "profile_use_background_image": true, "has_extended_profile": false, "default_profile": true, "default_profile_image": false, "following": null, "follow_request_sent": null, "notifications": null, "translator_type": "none", "withheld_in_countries": []}, "geo": null, "coordinates": null, "place": null, "contributors": null, "is_quote_status": false, "retweet_count": 1, "favorite_count": 3, "favorited": false, "retweeted": false, "possibly_sensitive": false, "lang": "es"}, "is_quote_status": false, "retweet_count": 1, "favorite_count": 0, "favorited": false, "retweeted": false, "lang": "es"}
"""

In [82]:
import json

In [84]:
json.dumps(a)

'"{\\"created_at\\": \\"Wed Jul 13 13:45:42 +0000 2022\\", \\"id\\": 1547215694651162625, \\"id_str\\": \\"1547215694651162625\\", \\"text\\": \\"RT @InesCalabuig2: Hoy hemos tenido una s\\u00faper visita a nuestras oficinas @TheBridge_Tech\\ud83d\\udc49primera Aceleradora de Talento Digital. \\nHemos pod\\u2026\\", \\"truncated\\": false, \\"entities\\": {\\"hashtags\\": [], \\"symbols\\": [], \\"user_mentions\\": [{\\"screen_name\\": \\"InesCalabuig2\\", \\"name\\": \\"Ines Calabuig\\", \\"id\\": 1232276995796586499, \\"id_str\\": \\"1232276995796586499\\", \\"indices\\": [3, 17]}, {\\"screen_name\\": \\"TheBridge_Tech\\", \\"name\\": \\"The Bridge\\", \\"id\\": 1162694149956603904, \\"id_str\\": \\"1162694149956603904\\", \\"indices\\": [73, 88]}], \\"urls\\": []}, \\"metadata\\": {\\"iso_language_code\\": \\"es\\", \\"result_type\\": \\"recent\\"}, \\"source\\": \\"<a href=\\"https://mobile.twitter.com\\" rel=\\"nofollow\\">Twitter Web App</a>\\", \\"in_reply_to_status_id\\": null,

In [94]:
!pip install twint

Collecting twint
  Downloading twint-2.1.20.tar.gz (31 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting aiohttp
  Downloading aiohttp-3.8.1-cp38-cp38-win_amd64.whl (555 kB)
     -------------------------------------- 556.0/556.0 kB 1.3 MB/s eta 0:00:00
Collecting aiodns
  Downloading aiodns-3.0.0-py3-none-any.whl (5.0 kB)
Collecting cchardet
  Downloading cchardet-2.1.7-cp38-cp38-win_amd64.whl (115 kB)
     -------------------------------------- 115.2/115.2 kB 1.7 MB/s eta 0:00:00
Collecting elasticsearch
  Downloading elasticsearch-8.3.1-py3-none-any.whl (382 kB)
     -------------------------------------- 382.5/382.5 kB 1.6 MB/s eta 0:00:00
Collecting pysocks
  Using cached PySocks-1.7.1-py3-none-any.whl (16 kB)
Collecting aiohttp_socks
  Downloading aiohttp_socks-0.7.1-py3-none-any.whl (9.3 kB)
Collecting schedule
  Downloading schedule-1.1.0-py2.py3-none-any.whl (10 kB)
Collecting geopy
  Using cached geopy-2.2.0-p

In [96]:
!pip install nest_asyncio



In [1]:
1+1

2

In [34]:
import twint
import nest_asyncio

# utiliza la libreria de TWINT para escrappear twitter y obtener la informacion deseada

nest_asyncio.apply()
# Configuracion
c = twint.Config()
c.Search = '@TheBridge_Tech'
# Guardar en CSV
c.Store_csv = True
c.Output = './data/raw_tweets.csv'
# especificar fecha
c.Since = '2022-03-01 00:00:00'
c.Until = '2022-04-01 00:00:00'
c.Hide_output = True



In [35]:

# Run
try:
    twint.run.Search(c)
except Exception as e:
    print(e)

[!] No more data! Scraping will stop now.
found 0 deleted tweets in this search.


In [36]:
pd.read_csv('./data/raw_tweets.csv').drop_duplicates()

Unnamed: 0,1509622664297693189,1509622664297693189.1,2022-03-31 22:04:25 Hora de verano romance,2022-03-31,22:04:25,+0200,1162694149956603904,thebridge_tech,The Bridge,Unnamed: 9,...,Unnamed: 26,Unnamed: 27,Unnamed: 28,Unnamed: 29,Unnamed: 30,[].3,Unnamed: 32,Unnamed: 33,Unnamed: 34,Unnamed: 35
0,1509126320386715655,1509126320386715655,2022-03-30 13:12:07 Hora de verano romance,2022-03-30,13:12:07,200,1402182570314539009,thefringelabs,TheFringe/LABS,,...,,,,,,[],,,,
1,1509108619782111235,1509108619782111235,2022-03-30 12:01:47 Hora de verano romance,2022-03-30,12:01:47,200,1162694149956603904,thebridge_tech,The Bridge,,...,,,,,,[],,,,
2,1509101924699803650,1509101924699803650,2022-03-30 11:35:11 Hora de verano romance,2022-03-30,11:35:11,200,1197839987867373568,vocentoeventos,vocentoEventos,,...,,,,,,[],,,,
3,1509074925654532099,1509074925654532099,2022-03-30 09:47:54 Hora de verano romance,2022-03-30,09:47:54,200,1162694149956603904,thebridge_tech,The Bridge,,...,,,,,,[],,,,
4,1508843117000249345,1508828414920253443,2022-03-29 18:26:46 Hora de verano romance,2022-03-29,18:26:46,200,1197839987867373568,vocentoeventos,vocentoEventos,,...,,,,,,[],,,,
5,1507711509467734019,1507711509467734019,2022-03-26 14:30:10 Hora estándar romance,2022-03-26,14:30:10,200,41545659,nodosenlared,Mario Lopez de Avila,,...,,,,,,[],,,,
6,1507059034146361344,1507059034146361344,2022-03-24 19:17:28 Hora estándar romance,2022-03-24,19:17:28,200,163586106,encamarasevilla,Escuela de Negocios,,...,,,,,,[],,,,
7,1507029045187387401,1507029045187387401,2022-03-24 17:18:18 Hora estándar romance,2022-03-24,17:18:18,200,1162694149956603904,thebridge_tech,The Bridge,,...,,,,,,[],,,,
8,1506941441574117378,1506941441574117378,2022-03-24 11:30:11 Hora estándar romance,2022-03-24,11:30:11,200,1162694149956603904,thebridge_tech,The Bridge,,...,,,,,,[],,,,
9,1504734193401618459,1504734193401618459,2022-03-18 09:19:22 Hora estándar romance,2022-03-18,09:19:22,200,1162694149956603904,thebridge_tech,The Bridge,,...,,,,,,[],,,,


In [37]:
import pandas as pd

In [21]:
pd.read_csv('./data/raw_tweets.csv').drop_duplicates()

Unnamed: 0,1547230643561664515,1547230643561664515.1,2022-07-13 16:45:06 Hora de verano romance,2022-07-13,16:45:06,+0200,1088261225153060864,gohubventures,GoHub Ventures,Unnamed: 9,...,Unnamed: 26,Unnamed: 27,Unnamed: 28,Unnamed: 29,Unnamed: 30,[].2,Unnamed: 32,Unnamed: 33,Unnamed: 34,Unnamed: 35
0,1547209900874506241,1547209900874506241,2022-07-13 15:22:40 Hora de verano romance,2022-07-13,15:22:40,200,1232276995796586499,inescalabuig2,Ines Calabuig,,...,,,,,,[],,,,
1,1546817334882476032,1546817334882476032,2022-07-12 13:22:45 Hora de verano romance,2022-07-12,13:22:45,200,1162694149956603904,thebridge_tech,The Bridge,,...,,,,,,[],,,,
2,1546397466517540866,1546397466517540866,2022-07-11 09:34:21 Hora de verano romance,2022-07-11,09:34:21,200,1085383992,wcapitalriesgo,Webcapitalriesgo.com,,...,,,,,,[],,,,
3,1544988613129261056,1544988613129261056,2022-07-07 12:16:04 Hora de verano romance,2022-07-07,12:16:04,200,27890913,elreferente,El Referente,,...,,,,,,[],,,,
4,1544595001086283778,1544595001086283778,2022-07-06 10:12:00 Hora de verano romance,2022-07-06,10:12:00,200,190658967,haycanal,hayCANAL.com,,...,,,,,,[],,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
194,1538882376138477569,1538882376138477569,2022-06-20 15:52:04 Hora de verano romance,2022-06-20,15:52:04,200,1162694149956603904,thebridge_tech,The Bridge,,...,,,,,,[],,,,
195,1536966880866586626,1536966880866586626,2022-06-15 09:00:34 Hora de verano romance,2022-06-15,09:00:34,200,421798449,vlctechhub,VLCTechHub,,...,,,,,,[],,,,
196,1536745017036918793,1536745017036918793,2022-06-14 18:18:58 Hora de verano romance,2022-06-14,18:18:58,200,1088043591681871872,laproductconfes,La Product Conf & LPCx España,,...,,,,,,[],,,,
197,1536710498712006660,1536710498712006660,2022-06-14 16:01:48 Hora de verano romance,2022-06-14,16:01:48,200,1162694149956603904,thebridge_tech,The Bridge,,...,,,,,,[],,,,


In [38]:
import pandas as pd

In [47]:
df = pd.DataFrame({"a":[1,2,3,4], "b": list("ABCD")})
df

Unnamed: 0,a,b
0,1,A
1,2,B
2,3,C
3,4,D


In [49]:
df.loc[len(df), :] = [5, "E"]

In [50]:
df

Unnamed: 0,a,b
0,1.0,A
1,2.0,B
2,3.0,C
3,4.0,D
4,5.0,E
