"""Tweet collector: downloads tweets matching per-case criteria into CSV files.

For every request row, tweets are fetched in chunks through the ``got3``
manager and appended to ``<OutputFolderPath>/<caseID>/<trialCount>.csv``.
"""
import urllib.request
import csv
import re
import sys
import getopt
import codecs
import os
import threading
import time
from datetime import datetime, timedelta

import got3 as got
import pyodbc

OutputFolderPath='F:/TC_Helper/Data/'

# Timestamp layout used both in the CSV "DateTime" column and for date math.
_DATETIME_FORMAT = '%Y-%m-%d %H:%M'


def WriteLog(text, SilentMode=False):
    """Print ``text`` prefixed with the current time (HH:MM:SS).

    Commas in ``text`` are replaced by semicolons so the printed line stays a
    two-field comma-separated record. Nothing is printed when ``SilentMode``
    is true.
    """
    line = datetime.now().strftime("%H:%M:%S") + ',' + text.replace(',', ';') + '\n'
    if not SilentMode:
        print(line)


def GetLastDateAndRefreshIDDFromFile(outputFileName):
    """Scan an existing output CSV and derive the state needed to resume.

    The file is split on newlines and each row on plain commas (no CSV
    quoting is honoured), so only the leading columns are reliable.

    Returns a 4-tuple:
        * number of lines in the file (the header line is counted),
        * the "DateTime" cell (column 5) of the last line with double quotes
          removed, or '' when it is not available,
        * a refresh id of the form ``TWEET-<last row id>-<first row id>``,
          or '' when the file has no data rows,
        * dict mapping author id (str, column 2) -> number of rows.
    """
    resumeDate = ''
    RefreshID = ''
    users = {}
    with open(outputFileName, 'r', encoding='ISO-8859-1') as f:
        flines = f.read().split('\n')

    if len(flines) > 1:
        for line in flines[1:]:
            cells = line.split(',')
            if len(cells) < 3:
                # Blank or truncated row (e.g. the tail of a tweet whose text
                # contained a newline): there is no author column to count.
                continue
            author_id = cells[2]
            users[author_id] = users.get(author_id, 0) + 1
        WriteLog(' Existing Authors:' + str(len(users)))

        last_cells = flines[-1].split(',')
        first_cells = flines[1].split(',')
        if len(last_cells) > 5:
            resumeDate = last_cells[5].replace('"', '')
        RefreshID = 'TWEET-' + last_cells[0] + '-' + first_cells[0]
    return len(flines), resumeDate, RefreshID, users


def _is_set(value):
    """Return True when an optional search parameter was actually supplied."""
    return value is not None and value != ''


#################################################################################################################
#Twit Collector
#################################################################################################################
def Start(allwords='', phrase='', skipwords='', anywords='', allhashtags='', anyhashtags='', allfrom='', anyfrom='', allto='', anyto='', allmention='', anymention='', near='', within='', since='', until='', lang='en', toptweets=True, maxtweets='', outputFile=None, caseID='', lastT = '', RefreshID='', users=None):
    """Build a ``got`` tweet criteria object, run the search and stream rows to ``outputFile``.

    Parameters
    ----------
    allwords : assigned to the criteria unchanged.
    phrase : comma-separated string, split on ",".
    skipwords, anywords, allhashtags, anyhashtags, allfrom, anyfrom, allto,
    anyto, allmention, anymention : space-separated strings, split on " ".
    near, within, since, lang : assigned unchanged when supplied.
    until : assigned when supplied; values longer than 10 characters lose
        their last 6 characters (a trailing " HH:MM") first.
    toptweets : when equal to True, ``topTweets`` is enabled on the criteria.
    maxtweets : converted with ``int`` when supplied.
    outputFile : writable text file object receiving one CSV row per tweet.
    caseID : accepted for interface compatibility; not used here.
    lastT : value returned as the last-tweet marker when the manager reports
        none and no tweets were returned.
    RefreshID : forwarded as third argument to ``TweetManager.getTweets``.
    users : dict of author id (str) -> tweet count, updated in place. A new
        dict is created when omitted.

    Empty strings and None are both treated as "not supplied".

    Returns
    -------
    (lastTweet, users)
    """
    if users is None:
        # A fresh dict per call; a literal ``{}`` default would be shared
        # between calls and accumulate authors across unrelated cases.
        users = {}

    tweetCriteria = got.manager.TweetCriteria()

    # (criteria attribute, raw value, separator) - separator None = keep as-is.
    word_filters = (
        ('allwords', allwords, None),      # include all words
        ('phrase', phrase, ','),           # include exact phrase(s)
        ('skipwords', skipwords, ' '),     # none of these words
        ('anywords', anywords, ' '),       # any of these words
        ('allhashtags', allhashtags, ' '), # all of these hashtags
        ('anyhashtags', anyhashtags, ' '), # any of these hashtags
        ('allfrom', allfrom, ' '),         # from all these users
        ('anyfrom', anyfrom, ' '),         # from any of these users
        ('allto', allto, ' '),             # to all these users
        ('anyto', anyto, ' '),             # to any of these users
        ('allmention', allmention, ' '),   # all of these mentions
        ('anymention', anymention, ' '),   # any of these mentions
    )
    for attribute, raw, separator in word_filters:
        if _is_set(raw):
            setattr(tweetCriteria, attribute, raw if separator is None else raw.split(separator))

    if _is_set(since):
        tweetCriteria.since = since
    if _is_set(until):
        if len(until) > 10:
            # Drop the " HH:MM" suffix of a 'YYYY-MM-DD HH:MM' timestamp.
            until = until[:-6]
        tweetCriteria.until = until
    # NOTE(review): kept as an equality test (not truthiness) because the
    # value may come from a database column; confirm its type with the caller.
    if toptweets == True:  # noqa: E712
        tweetCriteria.topTweets = True
    if _is_set(maxtweets):
        tweetCriteria.maxTweets = int(maxtweets)
    if _is_set(near):
        tweetCriteria.near = near
    if _is_set(within):
        tweetCriteria.within = within
    if _is_set(lang):
        tweetCriteria.lang = lang

    def receiveBuffer(tweets):
        """Append a batch of tweets to the output file and count their authors."""
        for t in tweets:
            # Keyed by str so counts merge with those read back from the CSV.
            author_id = str(t.AuthorId)
            users[author_id] = users.get(author_id, 0) + 1
            # NOTE(review): fields are joined without CSV quoting, so commas
            # or newlines inside the tweet text shift/break columns.
            fields = (
                t.ID, t.ConversationID, t.AuthorId, t.UserName, t.isVerified,
                t.Date.strftime(_DATETIME_FORMAT), t.Language, t.Text,
                t.Replies, t.Retweets, t.Favorites, t.Mentions, t.Hashtags,
                t.Permalink, t.URLs, t.isPartOfConversation, t.isReply,
                t.isRetweet, t.ReplyToUserID, t.ReplyToUserName,
                t.QuotedTweetID, t.QuotedTweetUser, t.QuotedTweetUserID,
            )
            outputFile.write('\n' + ','.join(str(field) for field in fields))
        outputFile.flush()
        WriteLog(' More %d saved on file...' % len(tweets), True)

    result, lastTweet = got.manager.TweetManager.getTweets(tweetCriteria, receiveBuffer, RefreshID)

    if lastTweet is None:
        if result:
            # NOTE(review): assumes returned tweets expose the same ``Date``
            # attribute that receiveBuffer reads - confirm against got3.
            lastTweet = result[-1].Date
        else:
            lastTweet = lastT
    return lastTweet, users


WriteLog("Starting program")


def _collect_case(row, trialCount, ResumeDownload):
    """Download all chunks for one request row into its CSV file."""
    WriteLog("-------------------------------------------------------------------------------")
    WriteLog("Application - " + str(row.caseID))

    case_folder = OutputFolderPath + '/'+ str(row.caseID)
    if not os.path.exists(case_folder):
        os.makedirs(case_folder)

    users = {}
    lastT = datetime.strptime('1900-01-01 00:00', _DATETIME_FORMAT)
    trial = trialCount
    RefreshID = ''
    PrevRefreshID = ''
    Untill = row.Untill
    outputFileName = case_folder + '/'+ str(trialCount) + '.csv'

    header='Tweet ID, Conversation ID, Author Id , Author Name, isVerified, DateTime, Language, Tweet Text, Replies, Retweets, Favorites, Mentions, Hashtags, Permalink, URLs, isPartOfConversation, isReply, isRetweet, Reply To User ID, Reply To User Name, Quoted Tweet ID, Quoted Tweet User Name, Quoted Tweet User ID'
    if os.path.exists(outputFileName):
        header = ''  # appending to an existing file: do not repeat the header
        if ResumeDownload:
            records, resumeDate, _, users = GetLastDateAndRefreshIDDFromFile(outputFileName)
            WriteLog(' File Exist with total recrods of '+ str(records) +', starting append mode')
            if resumeDate != '':
                Untill = resumeDate
                WriteLog(' Resume download date: '+ str(resumeDate))

    since_date = datetime.strptime(row.Since + ' 00:00', _DATETIME_FORMAT)

    # The context manager closes the file even if a chunk download raises.
    with codecs.open(outputFileName, "a+", "utf-8") as outputFile:
        outputFile.write(header)
        failedAttempt = 0
        failedRefresh = 2
        action = True
        while action:
            WriteLog(' Chunk - ' +str(trial))
            # Pass the effective Untill (resume date / stepped-back date),
            # not the row's original value, so resume and reset take effect.
            _, users = Start(
                row.allwords, row.phrase, row.skipWords, row.anyWords,
                row.allHashtags, row.anyHashtags, row.fromallUsers,
                row.fromanyUsers, row.toallUsers, row.toanyUsers,
                row.allMentions, row.anyMentions, row.Near, str(row.within),
                row.Since, Untill, row.lang, row.topTwits, row.maxTwits,
                outputFile, str(row.caseID), lastT, RefreshID, users)

            records, resumeDate, NewRefreshID, _ = GetLastDateAndRefreshIDDFromFile(outputFileName)
            WriteLog(' RefreshID = %s, Until Date = %s, resumeDate = %s, Since Date= %s ' % (str(RefreshID), str(Untill), resumeDate, str(row.Since) ))

            if str(resumeDate) == '':
                resumeDate = Untill
            if ':' not in str(resumeDate):
                resumeDate = resumeDate + ' 00:00'
            resume_at = datetime.strptime(resumeDate, _DATETIME_FORMAT)

            if (resume_at - since_date).days > 1:
                if failedAttempt < 1:
                    if NewRefreshID == PrevRefreshID:
                        # No new rows since the previous chunk.
                        print('1 - ' + str(failedAttempt))
                        failedAttempt += 1
                    else:
                        # NOTE(review): nesting of the statements below was
                        # ambiguous in the original layout; they are placed in
                        # the "refresh id changed" branch - confirm.
                        print('0 - ' + str(failedAttempt))
                        failedAttempt = 0
                        failedRefresh = 0
                        RefreshID = NewRefreshID
                        PrevRefreshID = NewRefreshID
                        WriteLog(' Chunk break')
                        WriteLog(' Refresh ID Reset - ' + RefreshID)
                        trial += 1
                        time.sleep(2)
                elif failedRefresh <= 2:
                    # Refresh id is stale: drop it and step Until back one day.
                    RefreshID = ''
                    PrevRefreshID = NewRefreshID
                    Untill = str(resume_at - timedelta(days=1))[0:10]
                    failedRefresh += 1
                    failedAttempt = 0
                    WriteLog(' Failed RefreshID, Reseting Until date - '+ Untill)
                else:
                    action = False
                    WriteLog(' Failed to retry with Reset Refresh and Until Date for 4 times in total. Until Date before closing: '+ str(Untill) )
            else:
                # Last saved tweet is within one day of the Since date.
                action = False
                WriteLog(' Since date Reached (a) - ' + str(row.Since))

    WriteLog(' Current Chunk file generated "%s".' % outputFileName)
    WriteLog(" Tweets done")


def getRequestFromDatabase(trialCount, ResumeDownload=False, appRows=None):
    """Download tweets for every request row in ``appRows``.

    Parameters
    ----------
    trialCount : used as the output file name (``<trialCount>.csv``) and as
        the starting chunk number shown in the log.
    ResumeDownload : when true and the output file already exists, the
        download continues from the date of the last saved row.
    appRows : iterable of row objects exposing the attributes read here
        (caseID, allwords, phrase, skipWords, anyWords, allHashtags,
        anyHashtags, fromallUsers, fromanyUsers, toallUsers, toanyUsers,
        allMentions, anyMentions, Near, within, Since, Untill, lang,
        topTwits, maxTwits). ``Since`` and ``Untill`` are used as strings.
        Nothing is processed when omitted.
    """
    for row in (appRows if appRows is not None else ()):
        print(row)
        _collect_case(row, trialCount, ResumeDownload)
    print('Exit')


# NOTE: ``Your_Object_That_Has_all_parameters`` is a placeholder; replace it
# with the iterable of request rows before running this script.
getRequestFromDatabase(0, True, Your_Object_That_Has_all_parameters )