In [22]:
import pandas as pd
import fasttext
import time
from pathlib import Path
from itertools import compress
from googletrans import Translator
from pycountry import languages

In [23]:
# Shared googletrans client; Skype2Anki calls translator.translate() on it for every extracted phrase
translator = Translator() #googletrans

Local path of the pre-trained fastText model used for language detection. It is explained in detail here:
https://fasttext.cc/docs/en/language-identification.html

In [24]:
# Machine-specific absolute path to the lid.176.bin language-identification model file;
# adjust it to wherever the model is stored on your machine.
fasttext_model_path=r"C:\Users\Itai\Anaconda3\Lib\site-packages\fasttext\lid.176.bin" 
# Load the model once; check_lang() reads this module-level object on every call.
fasttext_model = fasttext.load_model(fasttext_model_path)



Create functions that will be used in main function:

In [25]:
def check_lang(character,src_lang):
    """Tell whether fastText attributes ``character`` to the language ``src_lang``.

    The text is passed to the module-level ``fasttext_model`` (up to 5 labels,
    threshold 0.7). Each returned label has its ``__label__`` prefix removed and
    is looked up in pycountry as a two-letter code; labels with no match are
    ignored.

    Parameters:
        character: text to classify.
        src_lang: language name as spelled by pycountry (e.g. 'Chinese').

    Returns:
        True if ``src_lang`` is among the names of the predicted languages,
        otherwise False.
    """
    labels = fasttext_model.predict(character, k=5, threshold=0.7)[0]
    detected_names = []
    for label in labels:
        code = label.replace('__label__', '')
        match = languages.get(alpha_2=code)
        if match is None:
            # fastText label has no two-letter pycountry entry; skip it
            continue
        detected_names.append(match.name)
    return src_lang in detected_names

In [26]:
def SplitAndTrim(text,trim=True, split_del='\n'):
    """Split ``text`` on ``split_del``, optionally removing spaces first.

    Parameters:
        text: string to split.
        trim: when equal to True, every ' ' character is removed before splitting.
        split_del: delimiter handed to ``str.split``.

    Returns:
        List of the resulting substrings.
    """
    # Keep the explicit comparison: only values equal to True enable trimming
    prepared = text.replace(" ", "") if trim == True else text
    return prepared.split(split_del)

Create main function.
It parses the Skype JSON file into a CSV file that can be imported into Anki.

In [27]:
def Skype2Anki(Skype_file_path,contact_list,src_lang,trim=True, split_del='\n',start_date='2000-01-01',end_date='2099-01-01',src_lang_code=None,export_dir=None):
    """Export Skype messages written in ``src_lang`` to an Anki-importable CSV.

    Reads the ``conversations`` of a Skype JSON export, keeps the messages of
    the requested contacts that fall inside the date window, splits every
    message with ``SplitAndTrim``, keeps the pieces ``check_lang`` attributes
    to ``src_lang``, and writes them with their translation and pronunciation
    to ``Anki_CSV.txt`` (columns ``Source``, ``Translation``, ``Pinyin``).

    Parameters:
        Skype_file_path: path of the Skype JSON export.
        contact_list: display names of the conversations to include.
        src_lang: language name forwarded to ``check_lang`` (e.g. 'Chinese').
        trim: forwarded to ``SplitAndTrim``.
        split_del: forwarded to ``SplitAndTrim``.
        start_date: exclusive lower bound on ``originalarrivaltime``.
        end_date: exclusive upper bound on ``originalarrivaltime``.
        src_lang_code: translation code of the source language, used as the
            ``dest`` of the pronunciation request. When omitted, the
            notebook-level ``src_langgoogle_code`` is used.
        export_dir: directory that receives ``Anki_CSV.txt``. When omitted,
            the notebook-level ``expath`` is used.

    Returns:
        None. The result is written to disk and a summary line is printed.

    Raises:
        ValueError: if no conversation matches ``contact_list``.
    """
    # Backward compatibility: earlier versions read these two values straight
    # from notebook globals, so fall back to them when not passed explicitly.
    if src_lang_code is None:
        src_lang_code = src_langgoogle_code
    if export_dir is None:
        export_dir = expath
    out_file = Path(export_dir) / 'Anki_CSV.txt'

    start = time.time()
    # Read the conversations column from Skype's JSON file, then normalize,
    # i.e. break the Series of dicts up into a DataFrame with meaningful columns
    normalized = pd.json_normalize(pd.read_json(Skype_file_path,encoding='utf-8').conversations)

    # Convert the MessageList of every selected contact to a DataFrame, then concat them
    selected = normalized.MessageList[normalized.displayName.isin(contact_list)]
    if selected.empty:
        # pd.concat would fail with an unhelpful "No objects to concatenate"
        raise ValueError('No conversation found for contacts: ' + str(contact_list))
    # ignore_index avoids duplicated row labels coming from the per-contact frames
    df = pd.concat([pd.DataFrame(messages) for messages in selected], ignore_index=True)

    # Convert originalarrivaltime to datetime & keep rows strictly inside the window
    df['originalarrivaltime'] = pd.to_datetime(df['originalarrivaltime'])
    in_window = (df['originalarrivaltime'] > start_date) & (df['originalarrivaltime'] < end_date)
    # .copy() so the column assignments below do not write into a slice of the original frame
    df = df[in_window].copy()

    # Deleting white space is only appropriate for languages written without spaces (trim=True)
    df['content_trim_split'] = df['content'].apply(lambda x: SplitAndTrim(x,trim, split_del))
    # Flag every piece of every message: is it in the source language?
    df['is_src_lang'] = df['content_trim_split'].apply(lambda parts: [check_lang(part,src_lang) for part in parts])
    # Keep only the pieces that were flagged as source language
    df['clean_content'] = [
        list(compress(parts, flags))
        for parts, flags in zip(df['content_trim_split'], df['is_src_lang'])
    ]

    # Drop rows whose list is empty, then give every remaining piece its own row
    has_text = df['clean_content'].apply(len) > 0
    fltrd_df = df.loc[has_text, 'clean_content'].explode().reset_index(drop=True)

    # Translate every row (destination language is the translator's default)
    Translation = fltrd_df.apply(lambda x: translator.translate(x).text)
    # Requesting a "translation" into the source language itself yields its pronunciation
    Pinyin = fltrd_df.apply(lambda x: translator.translate(x,dest=src_lang_code).pronunciation)

    Anki_CSV = pd.concat([fltrd_df,Translation,Pinyin],axis=1)
    Anki_CSV.columns = ['Source','Translation','Pinyin']
    # Default write mode overwrites an existing file, so no delete-then-append is needed
    Anki_CSV.to_csv(out_file, index=False)

    runtime = time.time() - start
    print('CSV file created after '+ str(runtime) + ' seconds.\nLocation: ' +str(out_file))

Define variables that will be used with Skype2Anki() function

In [28]:
# Translation code of the source language; Skype2Anki uses it to request the pronunciation
src_langgoogle_code= 'zh-CN' #code list can be found here: https://cloud.google.com/translate/docs/languages
# Language name that check_lang compares against the pycountry names of the detected languages
src_lang='Chinese'
# Date window for the messages (format YYYY-MM-DD)
start_date='2000-01-01'
end_date='2099-01-01'
# Skype export to read (machine-specific path)
Skype_file_path=Path(r"F:\Dowloads\8_itai.seri_export\messages.json")
# Directory in which Skype2Anki creates Anki_CSV.txt
expath=Path("F:/Dowloads")
# Display names of the Skype conversations to extract messages from
contact_list=['Anne Wu']

Skype JSON file before processing & cleaning:

In [29]:
# Parse the export once and reuse it for both the column listing and the preview
skype_raw = pd.read_json(Skype_file_path,encoding='utf-8')
print(skype_raw.columns)
skype_raw.head()

Index(['userId', 'exportDate', 'conversations'], dtype='object')


Unnamed: 0,userId,exportDate,conversations
0,8:itai.seri,2020-06-25T03:41,"{'id': '48:calllogs', 'displayName': None, 've..."
1,8:itai.seri,2020-06-25T03:41,"{'id': '8:laiyi131418', 'displayName': 'Ruby',..."
2,8:itai.seri,2020-06-25T03:41,"{'id': '28:concierge', 'displayName': 'Skype',..."
3,8:itai.seri,2020-06-25T03:41,"{'id': '4:+393389369706', 'displayName': None,..."
4,8:itai.seri,2020-06-25T03:41,"{'id': '4:+393337569170', 'displayName': None,..."


Function execution:

In [30]:
# Pass the configured date window instead of repeating the literals, so editing the
# start_date/end_date variables defined earlier in the notebook actually takes effect
Skype2Anki(Skype_file_path,contact_list,src_lang,trim=True, split_del='\n',start_date=start_date,end_date=end_date)

CSV file created after 24.50207257270813 seconds.
Location: F:\Dowloads\Anki_CSV.txt


Result - CSV file ready to be imported into Anki:

In [31]:
# Build the path from expath rather than a non-raw 'F:\...' literal, whose
# backslashes form invalid escape sequences
pd.read_csv(expath/'Anki_CSV.txt')

Unnamed: 0,Source,Translation,Pinyin
0,我去大城市的电影院看电影。,I go to a movie theater in a big city to watch...,Wǒ qù dà chéngshì de diànyǐngyuàn kàn diànyǐng.
1,下车,get off,Xià chē
2,你先坐五路车，然后转十路车。,"You take the No. 5 car first, then transfer to...","Nǐ xiān zuò wǔ lù chē, ránhòu zhuǎn shí lù chē."
3,五路车,Five-way car,Wǔ lù chē
4,我应该坐哪路车？,Which bus should I take?,Wǒ yīnggāi zuò nǎ lù chē?
...,...,...,...
61,米饭,rice,Mǐfàn
62,请来一盘饺子。,Please have a plate of dumplings.,Qǐng lái yī pán jiǎozi.
63,我还没试过中国菜。,I haven't tried Chinese food.,Wǒ hái méi shìguò zhōngguó cài.
64,下课后我开始工作。,I started working after class.,Xiàkè hòu wǒ kāishǐ gōngzuò.
