In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re

# Read the raw train and test splits (in that order) from the local data/ folder.
train, test = map(pd.read_csv, ('data/train.csv', 'data/test.csv'))
# Cap DataFrame previews at 40 rows.
pd.options.display.max_rows = 40

In [2]:
def findCombined(txt, pattern=None):
    """Return the text of every date/time match found in ``txt``.

    Parameters
    ----------
    txt : str
        Text to scan.
    pattern : compiled regular expression, optional
        Pattern exposing a named group ``combined``. When omitted, the
        notebook-level ``myDate`` pattern is used; it is looked up at call
        time, so it must have been defined before the first call.

    Returns
    -------
    list of str
        The ``combined`` group of each non-overlapping match, in order of
        appearance; an empty list when nothing matches.
    """
    if pattern is None:
        # Resolved lazily so this cell can be run before the one building myDate.
        pattern = myDate
    return [match.group('combined') for match in pattern.finditer(txt)]

In [3]:
from tqdm import tqdm

# Register `progress_apply` / `progress_map` on pandas objects.
# `tqdm.pandas` is a classmethod that takes only keyword options for the
# progress bar; passing the tqdm class positionally (as before) raises
# TypeError on current tqdm releases.
tqdm.pandas()

In [4]:
# ---------------------------------------------------------------------------
# Building blocks for all-numeric dates (e.g. 2016-01-11, 11/01/2016).
# Everything below is compiled with re.VERBOSE, so whitespace and `# ...`
# text inside the pattern strings is ignored by the regex engine.
# ---------------------------------------------------------------------------
nDAY = r'(?:[0-3]?\d)'  # day of month with an optional leading zero (loose: also admits 0 and 32-39)
nMNTH = r'(?:11|12|10|0?[1-9])'  # month can be 1 to 12 with an optional leading zero
nYR = r'(?:(?:19|20)\d\d)'  # year restricted to the 20th or 21st century, on the basis that
                            # people don't generally use an all-number format for old dates
nDELIM = r'(?:[\/\-\._])?'  # optional separator; not referenced below (NUM_DATE uses named
                            # delimiter groups so both separators must be identical)

# Raw f-string: the regex escapes (\D, \/, \-, \.) must reach `re` untouched and
# would otherwise be invalid *string* escapes (a warning today, an error later).
# NOTE: the leading/trailing \D groups consume the neighbouring character, so a
# substitution also replaces that character.
NUM_DATE = rf"""
    (?P<num_date>
        (?:^|\D) # new bit here
        (?:
        # YYYY-MM-DD
        (?:{nYR}(?P<delim1>[\/\-\._]?){nMNTH}(?P=delim1){nDAY})
        |
        # YYYY-DD-MM
        (?:{nYR}(?P<delim2>[\/\-\._]?){nDAY}(?P=delim2){nMNTH})
        |
        # DD-MM-YYYY
        (?:{nDAY}(?P<delim3>[\/\-\._]?){nMNTH}(?P=delim3){nYR})
        |
        # MM-DD-YYYY
        (?:{nMNTH}(?P<delim4>[\/\-\._]?){nDAY}(?P=delim4){nYR})
        )
        (?:\D|$) # new bit here
    )"""

# ---------------------------------------------------------------------------
# Building blocks for written-out dates (e.g. "4 may, 2006", "february 18th").
# ---------------------------------------------------------------------------
# "ninth" is the correct spelling; the original misspelling "nineth" is kept so
# that anything matched before is still matched.
DAY = r"""
(?:
    # search 1st 2nd 3rd etc, or first second third
    (?:[23]?1st|2{1,2}nd|\d{1,2}th|2?3rd|first|second|third|fourth|fifth|sixth|seventh|eighth|ninth|nineth)
    |
    # or just a number, but without a leading zero
    (?:[123]?\d)
)"""
MONTH = r'(?:january|february|march|april|may|june|july|august|september|october|november|december|jan|feb|mar|apr|may|jun|jul|aug|sep|sept|oct|nov|dec)'
# 2-digit year, optionally preceded by an apostrophe or by 1-2 more digits ('05, 05, 2005).
YEAR = r"""(?:(?:[12]?\d|')?\d\d)"""
# Separator between date parts: one of whitespace . - \ / , or the word "of",
# with optional surrounding whitespace.
DELIM = r'(?:\s*(?:[\s\.\-\\/,]|(?:of))\s*)'

YEAR_4D = r"""(?:[12]\d\d\d)"""
# Raw f-string for the same reason as NUM_DATE (\W is a regex escape only).
# The leading/trailing \W groups consume the neighbouring character as well.
DATE_PATTERN = rf"""(?P<wordy_date>
    # non word character or start of string
    (?:^|\W)
        (?:
            # match various combinations of year month and day
            (?:
                # 4 digit year
                (?:{YEAR_4D}{DELIM})?
                    (?:
                    # Day - Month
                    (?:{DAY}{DELIM}{MONTH})
                    |
                    # Month - Day
                    (?:{MONTH}{DELIM}{DAY})
                    )
                # 2 or 4 digit year
                (?:{DELIM}{YEAR})?
            )
            |
            # Month - Year (2 or 4 digit)
            (?:{MONTH}{DELIM}{YEAR})
        )
    # non-word character or end of string
    (?:$|\W)
)"""

# Clock times such as 21:51, 9:05:30 or 10:15 p.m.
TIME = r"""(?:
(?:
# first number should be 0 - 59 with optional leading zero.
[012345]?\d
# second number is the same following a colon
:[012345]\d
)
# next we add our optional seconds number in the same format
(?::[012345]\d)?
# and finally add optional am or pm possibly with . and spaces
(?:\s*(?:a|p)\.?m\.?)?
)"""

# Final pattern: alternatives are tried in order, so a date (with an optional
# adjacent time) wins over a bare time at the same starting position.
COMBINED = f"""(?P<combined>
    (?:
        # time followed by date, or date followed by time
        {TIME}?{DATE_PATTERN}{TIME}?
        |
        # or as above but with the numeric version of the date
        {TIME}?{NUM_DATE}{TIME}?
    )
    # or a time on its own
    |
    (?:{TIME})
)"""

# IGNORECASE: month/ordinal words in any case; VERBOSE: allows the layout and
# comments used above; UNICODE: the default for str patterns, kept explicit.
myDate = re.compile(COMBINED, re.IGNORECASE | re.VERBOSE | re.UNICODE)

In [5]:
# Pre-compiled clean-up patterns. Raw strings keep regex escapes such as
# \[ \d \. \s from being treated as (invalid) Python string escapes.
_NEWLINE_RE = re.compile(r'\n')
_USER_LINK_RE = re.compile(r'\[\[User.*')
_IP_RE = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
# Covers https:// as well as http://; applied after lower-casing.
_URL_RE = re.compile(r'(https?://.*?\s)|(https?://.*)')


def _clean_comment(text):
    """Normalise one raw comment and return the cleaned string.

    Steps, in order:
      1. coerce to ``str`` (non-string cells are stringified);
      2. replace newline characters with a space;
      3. drop everything from the first ``[[User`` onwards
         (case-sensitive, runs before lower-casing);
      4. remove dotted-quad numbers (IP addresses);
      5. lower-case the text;
      6. remove http/https links, including one trailing whitespace character.
    """
    text = str(text)
    text = _NEWLINE_RE.sub(' ', text)
    text = _USER_LINK_RE.sub('', text)
    text = _IP_RE.sub('', text)
    text = text.lower()
    return _URL_RE.sub('', text)


# Overwrites `comment_text` in place on both frames.
# Punctuation is deliberately left untouched: the punctuation-stripping step
# that used to follow the link removal is disabled.
for comments in train, test:
    comments['comment_text'] = comments['comment_text'].map(_clean_comment)

In [6]:
# Add a `fewer_dates` column to both frames in which every span matched by
# `myDate` is replaced with the ' xxDATExx ' placeholder (progress bar via tqdm).
for df in (train, test):
    df['fewer_dates'] = df['comment_text'].progress_apply(
        lambda text: myDate.sub(' xxDATExx ', text)
    )

100%|████████████████████████████████████████████████████████████████████████| 159571/159571 [00:52<00:00, 3063.19it/s]
100%|████████████████████████████████████████████████████████████████████████| 153164/153164 [00:47<00:00, 3193.25it/s]


In [7]:
# Count the rows whose masked text contains at least one placeholder.
train_hits = train['fewer_dates'].str.contains('xxDATExx').sum()
test_hits = test['fewer_dates'].str.contains('xxDATExx').sum()
print('Found {} rows with dates in the training set'.format(train_hits))
print('Found {} rows with dates in the test set'.format(test_hits))

Found 12667 rows with dates in the training set
Found 9098 rows with dates in the test set


In [8]:
# Show full cell contents instead of truncating long comments.
# `None` is the supported "no limit" value (pandas >= 1.0); the former -1
# sentinel is deprecated and rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)
# Preview the first rows that contain a placeholder: original vs. masked text.
has_date = train['fewer_dates'].str.contains('xxDATExx')
train.loc[has_date, ['comment_text', 'fewer_dates']].head()

Unnamed: 0,comment_text,fewer_dates
1,"d'aww! he matches this background colour i'm seemingly stuck with. thanks. (talk) 21:51, january 11, 2016 (utc)","d'aww! he matches this background colour i'm seemingly stuck with. thanks. (talk) xxDATExx , xxDATExx (utc)"
10,""" fair use rationale for image:wonju.jpg thanks for uploading image:wonju.jpg. i notice the image page specifies that the image is being used under fair use but there is no explanation or rationale as to why its use in wikipedia articles constitutes fair use. in addition to the boilerplate fair use template, you must also write out on the image description page a specific explanation or rationale for why using this image in each article is consistent with fair use. please go to the image description page and edit it to include a fair use rationale. if you have uploaded other fair use media, consider checking that you have specified the fair use rationale on those pages too. you can find a list of 'image' pages you have edited by clicking on the """"my contributions"""" link (it is located at the very top of any wikipedia page when you are logged in), and then selecting """"image"""" from the dropdown box. note that any fair use images uploaded after 4 may, 2006, and lacking such an explanation will be deleted one week after they have been uploaded, as described on criteria for speedy deletion. if you have any questions please ask them at the media copyright questions page. thank you. (talk • contribs • ) unspecified source for image:wonju.jpg thanks for uploading image:wonju.jpg. i noticed that the file's description page currently doesn't specify who created the content, so the copyright status is unclear. if you did not create this file yourself, then you will need to specify the owner of the copyright. if you obtained it from a website, then a link to the website from which it was taken, together with a restatement of that website's terms of use of its content, is usually sufficient information. however, if the copyright holder is different from the website's publisher, then their copyright should also be acknowledged. as well as adding the source, please add a proper copyright licensing tag if the file doesn't have one already. 
if you created/took the picture, audio, or video then the tag can be used to release it under the gfdl. if you believe the media meets the criteria at wikipedia:fair use, use a tag such as or one of the other tags listed at wikipedia:image copyright tags#fair use. see wikipedia:image copyright tags for the full list of copyright tags that you can use. if you have uploaded other files, consider checking that you have specified their source and tagged them, too. you can find a list of files you have uploaded by following [ this link]. unsourced and untagged images may be deleted one week after they have been tagged, as described on criteria for speedy deletion. if the image is copyrighted under a non-free license (per wikipedia:fair use) then the image will be deleted 48 hours after . if you have any questions please ask them at the media copyright questions page. thank you. (talk • contribs • ) """,""" fair use rationale for image:wonju.jpg thanks for uploading image:wonju.jpg. i notice the image page specifies that the image is being used under fair use but there is no explanation or rationale as to why its use in wikipedia articles constitutes fair use. in addition to the boilerplate fair use template, you must also write out on the image description page a specific explanation or rationale for why using this image in each article is consistent with fair use. please go to the image description page and edit it to include a fair use rationale. if you have uploaded other fair use media, consider checking that you have specified the fair use rationale on those pages too. you can find a list of 'image' pages you have edited by clicking on the """"my contributions"""" link (it is located at the very top of any wikipedia page when you are logged in), and then selecting """"image"""" from the dropdown box. 
note that any fair use images uploaded after xxDATExx and lacking such an explanation will be deleted one week after they have been uploaded, as described on criteria for speedy deletion. if you have any questions please ask them at the media copyright questions page. thank you. (talk • contribs • ) unspecified source for image:wonju.jpg thanks for uploading image:wonju.jpg. i noticed that the file's description page currently doesn't specify who created the content, so the copyright status is unclear. if you did not create this file yourself, then you will need to specify the owner of the copyright. if you obtained it from a website, then a link to the website from which it was taken, together with a restatement of that website's terms of use of its content, is usually sufficient information. however, if the copyright holder is different from the website's publisher, then their copyright should also be acknowledged. as well as adding the source, please add a proper copyright licensing tag if the file doesn't have one already. if you created/took the picture, audio, or video then the tag can be used to release it under the gfdl. if you believe the media meets the criteria at wikipedia:fair use, use a tag such as or one of the other tags listed at wikipedia:image copyright tags#fair use. see wikipedia:image copyright tags for the full list of copyright tags that you can use. if you have uploaded other files, consider checking that you have specified their source and tagged them, too. you can find a list of files you have uploaded by following [ this link]. unsourced and untagged images may be deleted one week after they have been tagged, as described on criteria for speedy deletion. if the image is copyrighted under a non-free license (per wikipedia:fair use) then the image will be deleted 48 hours after . if you have any questions please ask them at the media copyright questions page. thank you. (talk • contribs • ) """
15,""" juelz santanas age in 2002, juelz santana was 18 years old, then came february 18th, which makes juelz turn 19 making songs with the diplomats. the third neff to be signed to cam's label under roc a fella. in 2003, he was 20 years old coming out with his own singles """"santana's town"""" and """"down"""". so yes, he is born in 1983. he really is, how could he be older then lloyd banks? and how could he be 22 when his birthday passed? the homie neff is 23 years old. 1983 - 2006 (juelz death, god forbid if your thinking about that) equals 23. go to your caculator and stop changing his year of birth. my god.""",""" juelz santanas age in 2002, juelz santana was 18 years old, then came xxDATExx which makes juelz turn 19 making songs with the diplomats. the third neff to be signed to cam's label under roc a fella. in 2003, he was 20 years old coming out with his own singles """"santana's town"""" and """"down"""". so yes, he is born in 1983. he really is, how could he be older then lloyd banks? and how could he be 22 when his birthday passed? the homie neff is 23 years old. 1983 - 2006 (juelz death, god forbid if your thinking about that) equals 23. go to your caculator and stop changing his year of birth. my god."""
23,""" the signpost: 24 september 2012 read this signpost in full single-page unsubscribe """,""" the signpost: xxDATExx read this signpost in full single-page unsubscribe """
27,"yes, because the mother of the child in the case against michael jackson was studied in here motives and reasonings and judged upon her character just as harshly as wacko jacko himself. don't tell me to ignore it and incriminate myself. i am going to continue refuting the bullshit that jayjg keeps throwing at me. 18:01, 16 jun 2005 (utc)","yes, because the mother of the child in the case against michael jackson was studied in here motives and reasonings and judged upon her character just as harshly as wacko jacko himself. don't tell me to ignore it and incriminate myself. i am going to continue refuting the bullshit that jayjg keeps throwing at me. xxDATExx , xxDATExx (utc)"


In [9]:
# Persist both frames (including the new `fewer_dates` column) without the index.
for frame, out_path in ((train, 'data/train_nodate.csv'), (test, 'data/test_nodate.csv')):
    frame.to_csv(out_path, index=False)