## Imports

In [89]:
import pandas as pd
import numpy as np

import nltk

import re

#### Reading the File

In [90]:
# Load the source spreadsheet; the cells below read and rewrite its 'Title' column.
df = pd.read_excel('Documents.xlsx')

#### Managing tweet lengths

In [91]:
# Character count of every title, kept as its own column.
df['length'] = df['Title'].map(len)
# Longest title in the corpus, as a plain Python int.
max_length = int(df['length'].max())

In [92]:
max_length  # show the longest 'Title' length computed above

536

## Preprocessing

#### Managing hyperlinks

In [94]:
def get_hyperlinks(text):
    """Return every hyperlink found in ``text``, in order of appearance.

    A hyperlink is any run of non-whitespace characters that starts with
    ``http``. An empty list is returned when there is none.
    """
    return [match.group(0) for match in re.finditer(r"http\S+", text)]

def rmv_hyperlink(text):
    """Return ``text`` with every hyperlink deleted.

    Only the link itself (``http`` followed by non-whitespace characters) is
    removed; the whitespace around it is left untouched.
    """
    link_pattern = re.compile(r"http\S+")
    return link_pattern.sub("", text)


# Extract the links first, then strip them from the titles.
# NOTE: this cell is not idempotent. Re-running it after 'Title' has been
# cleaned overwrites 'hyperlinks' with empty lists.
df['hyperlinks'] = df['Title'].map(get_hyperlinks)
df['Title'] = df['Title'].map(rmv_hyperlink)

#### Managing hashtags

In [104]:
def get_hashtags(text):
    """Return every hashtag found in ``text``, in order of appearance.

    A hashtag is ``#`` followed by one or more non-whitespace characters, so
    trailing punctuation stuck to the tag is part of the match.
    """
    return [match.group(0) for match in re.finditer(r"#\S+", text)]

def rmv_hashtags(text):
    """Return ``text`` with every hashtag deleted.

    Each ``#`` followed by non-whitespace characters is removed; the
    surrounding whitespace is kept as-is.
    """
    hashtag_pattern = re.compile(r"#\S+")
    return hashtag_pattern.sub("", text)

In [108]:
# Collect the hashtags before deleting them from the titles.
# NOTE: this cell is not idempotent. Re-running it after 'Title' has been
# cleaned overwrites 'hashtags' with empty lists.
df['hashtags'] = df['Title'].map(get_hashtags)
df['Title'] = df['Title'].map(rmv_hashtags)

#### Managing user references '@'

In [115]:
def get_users(text):
    """Return every ``@`` user reference found in ``text``.

    A reference is ``@`` followed by one or more non-whitespace characters, so
    punctuation directly attached to the handle is included in the match.
    """
    return [match.group(0) for match in re.finditer(r"@\S+", text)]

In [116]:
# Record the mentioned users; the mentions themselves stay in 'Title'.
df['referenced_users'] = df['Title'].map(get_users)

#### Tokenizing into sentences and PoS tagging

In [124]:
!pip install spacy

Collecting spacy
  Using cached spacy-2.3.2.tar.gz (5.9 MB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'error'


  ERROR: Command errored out with exit status 1:
   command: 'c:\users\artorias_doge\appdata\local\programs\python\python38-32\python.exe' 'c:\users\artorias_doge\appdata\local\programs\python\python38-32\lib\site-packages\pip' install --ignore-installed --no-user --prefix 'C:\Users\Artorias_Doge\AppData\Local\Temp\pip-build-env-m5id7k2r\overlay' --no-warn-script-location --no-binary :none: --only-binary :none: -i https://pypi.org/simple -- setuptools wheel 'cython>=0.25' 'cymem>=2.0.2,<2.1.0' 'preshed>=3.0.2,<3.1.0' 'murmurhash>=0.28.0,<1.1.0' thinc==7.4.1
       cwd: None
  Complete output (61 lines):
  Collecting setuptools
    Using cached setuptools-50.3.1-py3-none-any.whl (785 kB)
  Collecting wheel
    Using cached wheel-0.35.1-py2.py3-none-any.whl (33 kB)
  Collecting cython>=0.25
    Using cached Cython-0.29.21-cp38-cp38-win32.whl (1.6 MB)
  Collecting cymem<2.1.0,>=2.0.2
    Using cached cymem-2.0.3.tar.gz (51 kB)
  Collecting preshed<3.1.0,>=3.0.2
    Using cached preshed-3.

In [113]:
## french PoS:
## https://github.com/cmchurch/nltk_french/blob/master/french-nltk.py

## https://nlp.stanford.edu/software/CRF-NER.html
    
# The titles are French, so select the French Punkt model explicitly;
# nltk.sent_tokenize defaults to language='english'.
df['Title'].map(
    lambda title: nltk.sent_tokenize(title, language='french')
)

0              [Il a guéri du COVID ou il a tué Thanos ?]
1       [les premiers symptômes du coronavirus c’est l...
2       [Je souhaite une agréable journée à tout le mo...
3                 [Le COVID va bientôt fêter ses 1ans la]
4       [Le covid il a passé plus de temps que moi à l...
                              ...                        
9995    [[ Le taux de positivité chez les patients sym...
9996    [La @VilledeGrenoble, via le CCAS, a distribué...
9997    [Considérée comme un "cas contact", Roselyne B...
9998    [Bonjour, test covid très douloureux ce matin....
9999    [HAHA\n\nUne main apparaît dans le champ de vi...
Name: Title, Length: 10000, dtype: object

### Creating vocabulary

#### Tokenizing and Removing stopwords

In [44]:
# Import the corpus reader under an alias so that the name `stopwords` is not
# first bound to the reader and then rebound to a list in the same cell.
from nltk.corpus import stopwords as nltk_stopwords

# French stop-word list; rmv_stop_word() reads it as the global `stopwords`.
stopwords = nltk_stopwords.words('french')

def rmv_stop_word(tokens):
    """Lower-case ``tokens`` and drop those present in the global ``stopwords``.

    Returns a new list; the order of the remaining tokens is preserved.
    """
    kept = []
    for token in tokens:
        lowered = token.lower()
        if lowered in stopwords:
            continue
        kept.append(lowered)
    return kept

# Split each title into word tokens, then lower-case them and drop French
# stop words. language='french' makes word_tokenize use the French Punkt model
# for its internal sentence split instead of the English default.
# NOTE: this cell replaces the 'Title' strings with token lists, so it cannot
# be re-run without reloading the data first.
df['Title'] = df['Title'].map(
    lambda title: nltk.word_tokenize(title, language='french')
).map(
    rmv_stop_word
)

#### Stemming

In [45]:
## https://stackoverflow.com/questions/13131139/lemmatize-french-text

from nltk.stem.snowball import FrenchStemmer

# Single Snowball French stemmer instance, used by stem() below.
stemmer = FrenchStemmer()

def stem(tokens):
    """Return the stem of each token, using the global ``stemmer``.

    The result is a new list with the same length and order as ``tokens``.
    """
    return list(map(stemmer.stem, tokens))

# Replace every token list in 'Title' with the list of its stems.
df['Title'] = df['Title'].map(stem)

In [46]:
df['Title']  # inspect the titles after tokenizing, stop-word removal and stemming

0                      [a, guer, covid, a, tu, thanos, ?]
1       [premi, symptôm, coronavirus, ’, pert, goût, h...
2       [souhait, agréabl, journ, tout, mond, tout, mo...
3                          [covid, va, bientôt, fêt, 1an]
4                  [covid, a, pass, plus, temp, fac, con]
                              ...                        
9995    [[, #, covid19, ], taux, posit, chez, patient,...
9996    [@, villedegrenobl, ,, vi, ccas, ,, a, distrib...
9997    [consider, comm, ``, cas, contact, '', ,, rose...
9998    [bonjour, ,, test, covid, tres, doulour, matin...
9999    [hah, main, apparaît, champ, vision, d'alex, ,...
Name: Title, Length: 10000, dtype: object