# importing libraries

In [7]:
import asyncio
import csv
import json
import math
import re
import unicodedata as ud
from typing import Iterator

import aiohttp
import emoji
import pandas as pd
from tqdm import tqdm


# Helper functions

In [14]:
def gen_chunks(reader: csv.reader, chunksize: int = 1000) -> Iterator[list]:
    """Yield the rows of ``reader`` grouped into lists of at most ``chunksize``.

    Every yielded chunk is a brand-new list, so callers may keep references
    to earlier chunks (e.g. ``list(gen_chunks(...))``) without them being
    emptied or overwritten by later iterations.

    Args:
        reader (csv.reader): any iterable of rows, typically a csv reader.
        chunksize (int, optional): maximum number of rows per chunk.
            Defaults to 1000.

    Yields:
        Iterator[list]: the next chunk of rows; only the last chunk may hold
        fewer than ``chunksize`` rows. Nothing is yielded for an empty reader.

    Raises:
        ValueError: if ``chunksize`` is not positive (raised when iteration
            starts, because this is a generator).
    """
    if chunksize <= 0:
        raise ValueError("chunksize must be a positive integer")

    chunk = []
    for row in reader:
        chunk.append(row)
        if len(chunk) == chunksize:
            yield chunk
            # Rebind instead of clearing in place: the consumer may still
            # hold a reference to the list that was just yielded.
            chunk = []

    # Emit the remainder, but never an empty trailing chunk.
    if chunk:
        yield chunk

In [15]:
# Code points of the regional-indicator symbols (the characters flag emoji are
# built from) that must survive emoji removal, written in "U+XXXX" notation.
_KEPT_FLAG_CODE_POINTS = (
    'U+1F1E6',
    'U+1F1E7',
    'U+1F1E9',
    'U+1F1EA',
    'U+1F1EC',
    'U+1F1ED',
    'U+1F1EE',
    'U+1F1EF',
    'U+1F1F0',
    'U+1F1F1',
    'U+1F1F2',
    'U+1F1F3',
    'U+1F1F4',
    'U+1F1F5',
    'U+1F1F6',
    'U+1F1F7',
    'U+1F1F8',
    'U+1F1F9',
    'U+1F1FC',
    'U+1F1FE',
    'U+1F1FF',
)

# The same code points as real one-character strings. Comparing a character
# against the "U+XXXX" text itself can never match, so the notation has to be
# converted before it is usable as a whitelist.
_KEPT_FLAG_CHARS = frozenset(
    chr(int(code_point[2:], 16)) for code_point in _KEPT_FLAG_CODE_POINTS
)


def filter_tweet(tweet: str) -> str:
    """Filter every unwanted token out of a tweet.

    The steps, in order: drop emoji characters (except the whitelisted flag
    characters), drop @mentions, drop links, turn ``#`` and ``_`` into
    spaces so hashtag words are kept, drop Latin letters and ASCII digits,
    drop all punctuation and decimal digits of any script, and finally
    collapse runs of spaces into a single space.

    Args:
        tweet (str): the full text of the tweet.

    Returns:
        str: the filtered tweet.
    """
    # remove emojis except for the whitelisted flag characters
    tweet = ''.join(
        char for char in tweet
        if char in _KEPT_FLAG_CHARS or not emoji.is_emoji(char)
    )
    # remove mentions
    tweet = re.sub("@[A-Za-z0-9_]+", "", tweet)
    # remove links
    tweet = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', tweet)
    # remove hashes and underscores but keep the words
    tweet = tweet.replace("#", " ").replace("_", " ")
    # remove all english characters
    tweet = re.sub('[a-zA-Z0-9]+', '', tweet)
    # remove all punctuation and all decimal digits (arabic and english)
    tweet = ''.join(
        char for char in tweet
        if not ud.category(char).startswith(('P', 'Nd'))
    )
    # collapse multiple spaces last: removing punctuation that sat between
    # two spaces would otherwise leave a double space behind
    tweet = re.sub(' +', ' ', tweet)
    return tweet
                        


In [17]:
async def process_chunks(session: aiohttp.ClientSession, url: str, chunk: list) -> list:
    """Fetch the tweets of one chunk from the API, then filter and label them.

    Args:
        session (aiohttp.ClientSession): the client session used for the request.
        url (str): the url of the API.
        chunk (list): rows of ``(id, label)`` pairs; the ids are sent to the
            API and the labels are attached to the returned tweets.

    Returns:
        list: one ``{"id", "tweet", "label"}`` dict per tweet in the API
        response, with the tweet text passed through ``filter_tweet``.

    Raises:
        aiohttp.ClientResponseError: if the API answers with an HTTP error
            status.
        KeyError: if the response contains an id that was not in ``chunk``.
    """
    labels_by_id = dict(chunk)
    ids = list(labels_by_id)

    # The context manager releases the connection even if parsing fails.
    async with session.post(url, json=ids) as response:
        # Fail loudly on 4xx/5xx instead of trying to parse an error body.
        response.raise_for_status()
        tweets = await response.json()

    return [
        {
            "id": aid,
            "tweet": filter_tweet(tweet),
            "label": labels_by_id[aid],
        }
        for aid, tweet in tweets.items()
    ]

In [18]:
async def download_data(
    reader: csv.reader,
    writer: csv.DictWriter,
    chunksize: int = 1000,
    total_rows: int = 458198,
    url: str = "https://recruitment.aimtechnologies.co/ai-tasks",
) -> list:
    """Download every tweet listed in ``reader`` and write it to ``writer``.

    The input is read in chunks, each chunk is sent to the API one after the
    other, and the processed rows are written to the output CSV as soon as
    they arrive.

    Args:
        reader (csv.reader): A CSV reader object of the input CSV.
        writer (csv.DictWriter): A csv writer object for the output CSV.
        chunksize (int, optional): number of rows sent per request.
            Defaults to 1000.
        total_rows (int, optional): expected number of input rows; only used
            to size the progress bar. Defaults to 458198.
        url (str, optional): the url of the API.

    Returns:
        list: a list of all the acquired output from the API.
    """
    out = []
    expected_chunks = math.ceil(total_rows / chunksize)
    async with aiohttp.ClientSession() as session:
        chunks = gen_chunks(reader, chunksize=chunksize)
        for chunk in tqdm(chunks, total=expected_chunks):
            rows = await process_chunks(session, url, chunk)
            out.extend(rows)
            # Write per chunk so earlier chunks are not lost if a later
            # request fails.
            writer.writerows(rows)
    return out
        

In [20]:
reader = csv.reader(open('../data/raw/dialect_dataset.csv', 'rt'))
# getting the header first which contains the column labels
next(reader)
# defining the writer of the output file
writer= csv.DictWriter(open('../data/interim/out.csv', 'w'),fieldnames=['id','tweet','label'],delimiter="|") 

# writhing the column names
writer.writeheader()

# calling the function to download the data
out=await download_data(reader,writer)


100%|██████████| 459/459 [09:48<00:00,  1.28s/it]


In [22]:
# Load the downloaded rows into a DataFrame and show how many were collected.
df = pd.DataFrame(data=out)
df.shape[0]

458197

In [24]:
# Read the pipe-delimited interim file back to inspect what was written.
pd.read_csv("../data/interim/out.csv", delimiter="|")

Unnamed: 0,id,tweet,label
0,1175358310087892992,لكن بالنهاية ينتفض يغير,IQ
1,1175416117793349632,يعني هذا محسوب على البشر حيونه ووحشيه وتطلب...,IQ
2,1175450108898565888,مبين من كلامه خليجي,IQ
3,1175471073770573824,يسلملي مرورك وروحك الحلوه,IQ
4,1175496913145217024,وين هل الغيبه اخ محمد,IQ
...,...,...,...
458178,1021088486915559424,مرو خذوني وياكم بالاحمر,BH
458179,1024943651569446784,هاي لو كنت حابه تاكلي شو طلبك,BH
458180,1018588912648904832,الحين نسوي ربيان مشوي حياكم,BH
458181,1024945458576273408,الله يغفر لكم\nمساء الوررررد,BH


# Scrap functions (not used in the final code; they were only experiments to find the best approach)

In [None]:
% % timeit

for chunk in pd.read_csv("dialect_dataset.csv", chunksize=1000):
    #     print(chunk)
    pass

In [None]:
# %%timeit
reader = csv.reader(open('dialect_dataset.csv', 'rt'))
# getting the header first which contains the column labels
header = next(reader)

for chunk in gen_chunks(reader, chunksize=1000):
    ids, labels = zip(*chunk)
    url = "https://recruitment.aimtechnologies.co/ai-tasks"
    response = process_chunks(url, ids)
    break

In [None]:
%% timeit
reader = csv.reader(open('dialect_dataset.csv', 'rt'))

chunk, chunksize = [], 1000


def process_chunk(chuck):
    #     print len(chuck)
    pass
    # do something useful ...


for i, line in enumerate(reader):
    if (i % chunksize == 0 and i > 0):
        process_chunk(chunk)
        del chunk[:]  # or: chunk = []
    chunk.append(line)

# process the remainder
process_chunk(chunk)