## 正则表达式

In [1]:
import re

# Find every occurrence of the literal character "a" in the text.
languages = "C|Java|C#|Python|Javascript"
a = re.compile("a").findall(languages)
print(a)

['a', 'a', 'a', 'a']


In [3]:
# Match single digits: \d matches exactly one decimal digit per hit.
# The pattern is a raw string so the backslash reaches the regex engine
# unchanged; in a normal string literal "\d" is an invalid escape sequence,
# which Python reports with a warning.
a = '1C45Java452C#1431254 Python|Javascript'
r = re.findall(r"\d", a)
print(r)

['1', '4', '5', '4', '5', '2', '1', '4', '3', '1', '2', '5', '4']


In [5]:
# Match runs of one or more consecutive digits.
# Raw string: keeps the backslash of \d literal instead of relying on an
# invalid string escape.
r = re.findall(r"\d+", a)
print(r)

['1', '45', '452', '1431254']


In [6]:
# Match digit runs that end in "1"; \d* may match zero digits, so a lone "1"
# is matched too.
# Raw string: keeps the backslash of \d literal instead of relying on an
# invalid string escape.
r = re.findall(r"\d*1", a)
print(r)

['1', '1431']


In [7]:
# "." matches any single character, so four dots match any four characters;
# findall returns the successive non-overlapping four-character chunks.
four_chars = re.compile("....")
r = four_chars.findall(a)
print(r)

['1C45', 'Java', '452C', '#143', '1254', ' Pyt', 'hon|', 'Java', 'scri']


In [8]:
# Match two-character sequences that start with a digit: one digit followed
# by any single character.
# Raw string: keeps the backslash of \d literal instead of relying on an
# invalid string escape.
r = re.findall(r"\d.", a)
print(r)

['1C', '45', '45', '2C', '14', '31', '25', '4 ']


In [10]:
a = '1C45Java452C#1431254 Python|Javascript'
# Greedy matching: ".*" consumes as much text as it can, so the match runs
# up to the LAST "J" in the text and findall returns one maximal hit.
greedy = re.compile(".*J")
r = greedy.findall(a)
print(r)

['1C45Java452C#1431254 Python|J']


In [11]:
# Non-greedy matching: ".*?" stops at the first "J" it can reach, so several
# shorter pieces, each ending in "J", are returned.
lazy = re.compile(".*?J")
r = lazy.findall(a)
print(r)

['1C45J', 'ava452C#1431254 Python|J']


In [12]:
# Character sets []: match three consecutive characters, each of which must
# belong to its own bracketed set.
three_from_sets = re.compile("[12][342][1235]")
r = three_from_sets.findall(a)
print(r)

['143', '125']


In [14]:
# Match every run of ASCII letters in the text.
a = '1C45Java452C#1431254 Python|Javascript'
letters = re.compile("[a-zA-Z]+")
r = letters.findall(a)
print(r)

['C', 'Java', 'C', 'Python', 'Javascript']


In [15]:
# Bounded repetition {m,n}: match letter runs of 4 to 6 characters (not just
# 4 or 6). There must be no space between the 4 and the 6. Matching is
# greedy, so each hit takes as many letters as allowed.
# Expected: ['Java', 'Python', 'Javasc', 'ript']
letters_4_to_6 = re.compile("[a-zA-Z]{4,6}")
r = letters_4_to_6.findall(a)
print(r)

['Java', 'Python', 'Javasc', 'ript']


In [17]:
# General vs. exact matching
a = 'Python2Python3'

# Match "Python" together with the digit that follows it.
# Raw string: keeps the backslash of \d literal instead of relying on an
# invalid string escape.
# Expected: ['Python2', 'Python3']
r = re.findall(r"Python\d", a)
print(r)

['Python2', 'Python3']


In [18]:
# Return only the digit that follows "Python": with a single capturing
# group, findall yields the group's text rather than the whole match.
# Raw string: keeps the backslash of \d literal instead of relying on an
# invalid string escape.
# Expected: ['2', '3']
r = re.findall(r"Python(\d)", a)
print(r)

['2', '3']


In [19]:
# With two capturing groups, findall returns a list of tuples, one tuple of
# group texts per match.
# Raw string: keeps the backslash of \d literal instead of relying on an
# invalid string escape.
r = re.findall(r"(Python)(\d)", a)
print(r)

[('Python', '2'), ('Python', '3')]


In [20]:
# Match text wrapped in literal parentheses, i.e. "(Python<digit>)": \( and \)
# are escaped so they mean real parenthesis characters rather than a group.
# `a` contains no parentheses, so the result here is an empty list.
# Raw string: keeps the backslashes literal instead of relying on invalid
# string escapes.
r = re.findall(r"\(Python\d\)", a)
print(r)

[]


In [16]:
# Sample HTML fragment of a movie-detail page, used as input by the
# tag-extraction regex examples that search for "movie-index-title" elements.
text = """

<div class="celeInfo-right clearfix">
            <div class="movie-brief-container">
      <h1 class="name">亲亲哒</h1>
      <div class="ename ellipsis">Sorrow without Tears</div>
      <ul>
        <li class="ellipsis">
            <a class="text-link" href="/films?catId=1" target="_blank"> 剧情 </a>
            <a class="text-link" href="/films?catId=16" target="_blank"> 家庭 </a>
        </li>
        <li class="ellipsis">
        中国大陆
          / 95分钟
        </li>
        <li class="ellipsis">2020-05-28大陆上映</li>
      </ul>
    </div>
    <div class="action-buyBtn">
      <div class="action clearfix" data-val="{movieid:1208919}">
        <a class="wish " data-wish="false" data-score="" data-bid="b_gbxqtw6x">
          <div>
            <i class="icon wish-icon"></i>
              <span class="wish-msg" data-act="wish-click">想看</span>
          </div>
        </a>
        <a class="score-btn " data-bid="b_rxxpcgwd">
          <div>
            <i class="icon score-btn-icon"></i>
            <span class="score-btn-msg" data-act="comment-open-click">
                评分
            </span>
          </div>
        </a>
      </div>
    </div>

    <div class="movie-stats-container">

        <div class="movie-index">
          <p class="movie-index-title">想看数</p>
          <div class="movie-index-content score normal-score">
              <span class="index-left info-num one-line"><span class="stonefont"></span></span>
          </div>
        </div>

        

        <div class="movie-index">
          <p class="movie-index-title">累计票房</p>
          <div class="movie-index-content box">
              <span class="no-info">(暂无)</span>
          </div>
        </div>
    </div>

      </div>

"""

In [21]:
# re.S lets "." match newline characters as well, so a match may span lines.
# The parentheses form a capturing group, so only the text between the tags
# is returned.
# Expected: ['想看数', '累计票房']
title_text = re.compile('<p class="movie-index-title">(.*?)</p>', re.S)
r = title_text.findall(text)
print(r)

['想看数', '累计票房']


In [22]:
# Without a capturing group, findall returns each whole match, tags included.
# Expected: ['<p class="movie-index-title">想看数</p>', '<p class="movie-index-title">累计票房</p>']
title_tag = re.compile('<p class="movie-index-title">.*?</p>', re.S)
r = title_tag.findall(text)
print(r)

['<p class="movie-index-title">想看数</p>', '<p class="movie-index-title">累计票房</p>']


## python中re模块compile与findall

In [42]:
# Compile the phone-number pattern once so it can be reused: three digits,
# a hyphen, then three to eight digits, anchored to the whole string.
# Raw string: keeps the backslashes of \d literal instead of relying on
# invalid string escapes.
re_telephone = re.compile(r'^(\d{3})-(\d{3,8})$')
# match() returns a match object; groups() gives the captured parts as a tuple.
A = re_telephone.match('010-12345').groups()
print(A)

# The module-level findall with the same pattern text returns the same
# groups, wrapped in a list with one tuple per match.
print(re.findall(r'^(\d{3})-(\d{3,8})$', '010-12345'))

('010', '12345')
[('010', '12345')]


## 社交网络文本的处理

### 使用nltk对社交网络文本处理

In [56]:
from nltk.tokenize import word_tokenize

# Tokenize a sample tweet with NLTK's word tokenizer. As the recorded output
# shows, it breaks the @mention, the emoticon, the URL and the hashtag into
# separate pieces.
tweet = 'RT @angelababy: love you baby! :D http://ah.love #168cm'
tokens = word_tokenize(tweet)
print(tokens)

['RT', '@', 'angelababy', ':', 'love', 'you', 'baby', '!', ':', 'D', 'http', ':', '//ah.love', '#', '168cm']


### 使用正则表达式对社交网络文本处理

In [68]:

# (?:...) is a non-capturing group: it groups the enclosed pattern without
# storing the matched text for later retrieval.
#
# The standard-library `re` module is imported here. Every construct used
# below is supported by it, and importing the third-party `regex` package
# under the name `re` would silently change what `re` means for any cell
# executed afterwards.
import re

tweet = 'RT @angelababy: love you baby! :D http://ah.love #168cm'

# Alternatives are tried in list order, so the more specific patterns
# (tags, mentions, hashtags, URLs) come before the generic fallbacks.
regex_str = [
    r'<[^>]+>',     # HTML tags
    r'(?:@[\w_]+)',  # @mentions
    # Hashtags. "#" is escaped because re.VERBOSE treats a bare "#" as the
    # start of a comment.
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)",
    # URLs. The character class uses a plain "&"; the HTML-entity form
    # "&amp;" was a copy/paste artefact and matched the same characters.
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',
    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words containing - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else that is not whitespace
]

# Join the alternatives into one pattern wrapped in a single capturing group,
# so findall returns the text of each token.
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)

result = tokens_re.findall(tweet)
print(result)


['RT', '@angelababy', ':', 'love', 'you', 'baby', '!', ':', 'D', 'http://ah.love', '#168cm']
