In [2]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re

In [3]:
wp_filetypes = 'https://en.wikipedia.org/wiki/List_of_file_formats'

In [4]:
page = requests.get(wp_filetypes)

In [5]:
soup = BeautifulSoup(page.text,'html.parser')

In [6]:
#idenfity the heading tags
#create dict to track starting line of each heading tag
heading_tags = ["h1", "h2", "h3","h4"]
heading_pos = [i.sourceline for i in soup.find_all(heading_tags)]
heading_text = [i.text for i in soup.find_all(heading_tags)]

#identify the content that we are interested in and the related links
#track the line positions of each

#text and position of content
content_pos = [i.sourceline for i in soup.find_all('li')]
content_text = [i.text for i in soup.find_all('li')]

#links
href_pos = [i.sourceline for i in soup.find_all('a',href=True)]
href_link = [i['href'] for i in soup.find_all('a',href=True)]


In [7]:
def get_tag_pos(poslist,contentlist):
    #construct a dict to hold the starting and ending line of each element you are processing
    #input: two lists of same length
    #output: dict of tuples with start and end pos
    poses = {}

    for item in range(len(contentlist)):
        begin = poslist[item]
        
        #create tuple with beginning and ending indexes of each heading
        if item == len(poslist)-1:
            end = content_pos[-1] #if end of heading list take last content tag pos
        else:
            end = poslist[item+1]-1  #next line minus 1
        
        poses[contentlist[item]] = (begin,end)

    return poses

In [8]:
heading_scope = get_tag_pos(heading_pos,heading_text)

In [9]:
content_scope = get_tag_pos(content_pos,content_text)

In [10]:
links_scope = get_tag_pos(href_pos,href_link)

In [11]:
def connect_links(content,links):
    #create dict to connect the text from list content 
    #to relevant tags
    #input: two dicts
    #output: nested dict
    content_links = {}

    for text in content:
        
        #nest dict holds the position of the content and the link
        temp = {}
        temp['pos'] = content[text]
        temp['link'] = []
                    
        for link in links:
            if content[text][0] <= links[link][0] <= content[text][1]:
                temp['link'].append(link)

        content_links[text] = temp
    
    return content_links

In [12]:
files_links = connect_links(content_scope,links_scope)

In [13]:
def join_hdr_content(hdr,content):
    #join the headers with the content (file types and links) based on position
    
    hdr_cont_out = dict(zip(hdr.keys(), ([] for _ in hdr.keys())))
    #hdr_cont_out_lnk = dict(zip(hdr.keys(), ([] for _ in hdr.keys())))

    for h in hdr:
        for c in content:
            #check if the file info tag is at the position of the heading
            if hdr[h][0] <= content[c]['pos'][0] <= hdr[h][1]:
                file_info = []
                file_info.append(c)
                
                if len(content[c]['link']) > 0:
                    file_info.append(content[c]['link'][0])
                else:
                    file_info.append(None)

                hdr_cont_out[h].append(file_info)

    
    #remove unneeded items in the final output
    return  {k: v for k, v in hdr_cont_out.items() if k.endswith('[edit]')}
    
    

In [14]:
final_dict = join_hdr_content(heading_scope,files_links)

In [15]:

def df_from_wpdict(wpdict):
    #create a dataframe for each item in a dict and append to a main df
    #dict -> df
    
    df_cols = ['file_category','file_info','wikilink']
    filetypes = pd.DataFrame(columns=df_cols)
    
    
    #loop through the dict items and append to the df
    for k in wpdict:
        for f in wpdict[k]:
            data = [k,f[0],f[1]]
            df = pd.DataFrame([data],columns=df_cols)
            filetypes = filetypes.append(df)
    
    return filetypes    

In [16]:
df_ft = df_from_wpdict(final_dict)

In [18]:
def clean_fc(value):
    #clean the text in file_category
    return value.replace('[edit]','')

In [19]:
def remove_elrf(df):
    #remove the headings that related to external links and references
    #in the wikipedia article
    return df[~df.file_category.isin(['External links','References','See also'])].reset_index().drop('index',axis=1)


In [20]:
df_ft.file_category = df_ft.file_category.apply(clean_fc)

In [21]:
df_ft_rm = remove_elrf(df_ft)

In [22]:
def clean_links(value):
    #clean up the wikipedia links to just include this from wikipedia
    if value:
        if '/wiki/' in value:
            return 'https://en.wikipedia.org/' + value
        else:
            return None

In [23]:
df_ft_rm.wikilink = df_ft_rm.wikilink.apply(clean_links)

In [24]:
def extract_file_ext(value):
    #extract a file extension from string
    if value:
        if '-' in value:
            return value.split('-')[0]
        if ' – ' in value:
            return value.split(' – ')[0]
        else:
            pass 
        
        
        #else:
        #    re.findall(r'\.\w+',value)[0].replace('.','')


    

In [25]:
df_ft_rm['file_ext'] = df_ft_rm.file_info.apply(extract_file_ext)

In [26]:
dcm = df_ft_rm[df_ft_rm.file_info == 'Digital Imaging and Communications in Medicine (DICOM) (.dcm)'].file_info[847]

In [27]:
df_ft_rm[df_ft_rm.file_info == 'Chemical table file (CTab) (.mol, .sd, .sdf)']

Unnamed: 0,file_category,file_info,wikilink,file_ext
847,Biomedical imaging,Digital Imaging and Communications in Medicine...,https://en.wikipedia.org//wiki/Digital_Imaging...,


In [63]:
cft = df_ft_rm[df_ft_rm.file_info == 'Chemical table file (CTab) (.mol, .sd, .sdf)']

In [31]:
re_filetypes(dcm)

['dcm']

In [33]:
re_filetypes('PostScript (.ps, .ps.gz)')

['ps', 'ps', 'gz']

In [76]:
def re_filetypes(value):
    #function leverages re to pull 
    
    result_dot = re.findall(r'\.\w+',value)
    result_dot = [i.replace('.','') for i in result_dot]


    result_nodot = re.findall(r'\w+',value)
    
    if len(result_dot) > 0:
        return result_dot

    elif len(result_nodot) > 0:
        return result_nodot[0]
        

In [115]:
def apply_re_ft(df):
    #apply the re filetypes and create new rows for each file extension
    #for one row of df
    filter_values = list(df.file_info)

    final_df = df.copy().truncate(after=0)

    for fi in filter_values:
        indf = df[df.file_info == fi]

        outdf = indf.copy()

        value = list(outdf.file_info)[0]
        
        ft_values =  re_filetypes(value)

        if  len(ft_values) > 1:
            
            for item in ft_values:
                tempdf = outdf.copy()
                tempdf.file_ext = item
                outdf = outdf.append(tempdf)

            outdf =  outdf[outdf.file_ext.isna() == False]

        else:
            outdf.file_ext = ft_values[0]

        final_df = final_df.append(outdf)

    return final_df.drop_duplicates()

    



In [70]:
df_ft_rm_nulls = df_ft_rm[df_ft_rm.file_ext.isna()==True]
df_ft_rm_nonnulls = df_ft_rm[~df_ft_rm.file_ext.isna()==True]

In [116]:
test = apply_re_ft(df_ft_rm_nulls)

In [117]:
test

Unnamed: 0,file_category,file_info,wikilink,file_ext
24,Archive and compressed,bzip2 (.bz2) –,https://en.wikipedia.org//wiki/Bzip2,bz2
51,Archive and compressed,HTML (.html) HTML code file,,html
61,Archive and compressed,lzo,https://en.wikipedia.org//wiki/Lzo,l
61,Archive and compressed,lzo,https://en.wikipedia.org//wiki/Lzo,z
61,Archive and compressed,lzo,https://en.wikipedia.org//wiki/Lzo,o
...,...,...,...,...
1307,"Video editing, production",WFP / WVE — Wondershare Filmora Project,,P
1355,Video game data,sometimes used for general data contained with...,,PK3
1405,Video game data,"HE0, HE2, HE4 HE games File",,H
1405,Video game data,"HE0, HE2, HE4 HE games File",,E


In [113]:
print(set(test.file_info))

{'NeXML–XML format for phylogenetic trees', 'PPJ & PRPROJ– Adobe Premiere Pro video editing file', 'Simplified molecular input line entry specification (SMILES) (.smi)', 'Alias (Mac OS)', 'BTM — Batch file', 'GeoJSON –Geographically located data in object notation', 'WFP / WVE — Wondershare Filmora Project', 'Amigaguide', 'Scene description languages (3D vector image formats)', 'PostScript (.ps, .ps.gz)', 'Standard Test Data Format', 'sometimes used for general data contained within the .PK3/PK4 files', 'bzip2 (.bz2) –', 'O — Compiled and optimized C/C++ binary', 'Digital Imaging and Communications in Medicine (DICOM) (.dcm)', 'LZX', 'MPEG (.mpeg, .mpg, .mpe)', 'HTML (.html) HTML code file', 'HE0, HE2, HE4 HE games File', 'RenderMan', 'lzo', 'MP3', 'MPEG Layer 3', 'Chemical table file (CTab) (.mol, .sd, .sdf)', 'Joint Committee on Atomic and Molecular Physical Data (JCAMP) (.dx, .jdx)', 'Medical Imaging NetCDF (MINC) format, previously based on NetCDF; since version 2.0, based on HDF5 

In [114]:
print(list(test[test.file_info == 'Digital Imaging and Communications in Medicine (DICOM) (.dcm)'].file_ext))

['dcm']


In [81]:
df_ft_rm_nulls

Unnamed: 0,file_category,file_info,wikilink,file_ext
24,Archive and compressed,bzip2 (.bz2) –,https://en.wikipedia.org//wiki/Bzip2,
51,Archive and compressed,HTML (.html) HTML code file,,
61,Archive and compressed,lzo,https://en.wikipedia.org//wiki/Lzo,
63,Archive and compressed,LZX,https://en.wikipedia.org//wiki/LZX,
274,Test technology,Standard Test Data Format,https://en.wikipedia.org//wiki/Standard_Test_D...,
369,Document,Amigaguide,https://en.wikipedia.org//wiki/Amigaguide,
426,Document,Troff,https://en.wikipedia.org//wiki/Troff,
474,Geographic information system,GeoJSON –Geographically located data in object...,https://en.wikipedia.org//wiki/GeoJSON,
582,Vector graphics,MOVIE.BYU,,
583,Vector graphics,RenderMan,https://en.wikipedia.org//wiki/RenderMan_Inter...,


In [79]:
apply_re_ft(cft)

Unnamed: 0,file_category,file_info,wikilink,file_ext
810,Chemistry,"Chemical table file (CTab) (.mol, .sd, .sdf)",https://en.wikipedia.org//wiki/Chemical_table_...,mol
810,Chemistry,"Chemical table file (CTab) (.mol, .sd, .sdf)",https://en.wikipedia.org//wiki/Chemical_table_...,sd
810,Chemistry,"Chemical table file (CTab) (.mol, .sd, .sdf)",https://en.wikipedia.org//wiki/Chemical_table_...,sdf


In [37]:
df_dcm = df_ft_rm[df_ft_rm.file_info == 'Digital Imaging and Communications in Medicine (DICOM) (.dcm)']

In [35]:
re_filetypes('expander (see bundle)')

['expander', 'see', 'bundle']

In [185]:
re_filetype_nodot('PCL')

['PCL']

In [180]:
def final_output():
    return df_ft_rm